From a69ddd52a74d7c1395c93bd4f862ec1e4366c75c Mon Sep 17 00:00:00 2001
From: Stefan Roiser
Date: Fri, 21 Nov 2025 10:25:12 +0100
Subject: [PATCH 01/18] merge master

---
 .../template_files/gpu/MemoryAccessGs.h       |  8 +++
 .../iolibs/template_files/gpu/MemoryBuffers.h |  4 +-
 .../iolibs/template_files/gpu/processConfig.h | 16 +++++
 .../iolibs/template_files/gpu/process_cc.inc  |  1 +
 .../gpu/process_function_definitions.inc      | 70 +++++++++++++++++--
 .../gpu/process_sigmaKin_function.inc         | 56 +++++----------
 .../CUDACPP_SA_OUTPUT/model_handling.py       | 24 +++++--
 7 files changed, 132 insertions(+), 47 deletions(-)
 create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/processConfig.h

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }
 
+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h
index 936ef7a7ff..8893f065ea 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_%(model_name)s.h"
+#include "processConfig.h"
 
 #include 
 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;
 
   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/processConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/processConfig.h
new file mode 100644
index 0000000000..a4777347d0
--- /dev/null
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+ + +#ifndef MG5_CONFIG_%(processid_uppercase)s_H +#define MG5_CONFIG_%(processid_uppercase)s_H 1 + +namespace processConfig { + + constexpr int ndiagrams = %(ndiagrams)d; + +} + +#endif // MG5_CONFIG_%(processid_uppercase)s_H \ No newline at end of file diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 4c35c3eec6..d742565283 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -25,6 +25,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 0665bfb93b..80b239bfc3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -16,9 +16,72 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] @@ -714,7 +777,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 4372edde52..c4c8d93314 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -31,11 +31,12 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -50,6 +51,15 @@ // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s +/* sr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + unsigned int channelId = getChannelId( allChannelIds ); +#endif + // Running sum of partial amplitudes squared for event by event color selection (#402) + // (for the single event processed in calculate_wavefunctions) + fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; + fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) +sr */ for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; @@ -110,38 +120,7 @@ const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -320,11 +299,12 @@ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 3f8a85afa6..544631373b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1451,6 +1451,7 @@ def generate_process_files(self): self.edit_check_sa() self.edit_mgonGPU() self.edit_processidfile() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses) + self.edit_processConfig() # sub process specific, not to be symlinked from the Subprocesses 
directory
         self.edit_colorsum() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses)
         self.edit_testxxx() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific)
         self.edit_memorybuffers() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific)
@@ -1543,6 +1544,17 @@ def edit_colorsum(self):
         ff = open(pjoin(self.path, 'color_sum.cc'),'w')
         ff.write(template % replace_dict)
         ff.close()
+
+    def edit_processConfig(self):
+        """Generate processConfig.h"""
+        ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_processConfig')
+        template = open(pjoin(self.template_path,'gpu','processConfig.h'),'r').read()
+        replace_dict = {}
+        replace_dict['ndiagrams'] = len(self.matrix_elements[0].get('diagrams'))
+        replace_dict['processid_uppercase'] = self.get_process_name().upper()
+        ff = open(pjoin(self.path, 'processConfig.h'),'w')
+        ff.write(template % replace_dict)
+        ff.close()
 
     def generate_subprocess_directory_end(self, **opt):
         """ opt contain all local variable of the fortran original function"""
@@ -1939,7 +1951,7 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi
       for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
         COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -1953,7 +1965,7 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi
       unsigned int channelId = gpu_channelId( allChannelIds );
 #endif
       // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif""")
         diagrams = matrix_element.get('diagrams')
@@ -1985,8 +1997,12 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi
                 ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % diag_to_config[id_amp]) # BUG #472
                 ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % id_amp) # wrong fix for BUG #472
                 res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL")
-                res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % diagram.get('number'))
-                res.append("if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );")
+                diagnum = diagram.get('number')
+                res.append("if( channelId != 0 )")
+                res.append("{")
+                res.append(" numerators_sv[%i] += cxabs2( amp_sv[0] );" % (diagnum-1))
+                res.append(" denominators_sv += cxabs2( amp_sv[0] );")
+                res.append("}")
                 res.append("#endif")
             else:
                 res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL")

From bbe8ef2fe06133a5ea1a5af5132279004f785280 Mon Sep 17 00:00:00 2001
From: Stefan Roiser
Date: Fri, 21 Nov 2025 10:57:11 +0100
Subject: [PATCH 02/18] remove -l from bash call

---
 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
index e54290d5a7..f2d7189ddd 100644
---
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -264,7 +264,7 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): done""" try: result = subprocess.run( - ["bash", "-lc", patch_coupl_write], + ["bash", "-c", patch_coupl_write], cwd=pjoin(self.dir_path, "Source", "MODEL"), text=True, capture_output=True, From 637499a37328400820c39f73a11cf85930f01499 Mon Sep 17 00:00:00 2001 From: Stefan Roiser Date: Fri, 21 Nov 2025 11:21:41 +0100 Subject: [PATCH 03/18] formatting changes --- .../template_files/gpu/process_function_definitions.inc | 6 +++--- .../template_files/gpu/process_sigmaKin_function.inc | 9 --------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 80b239bfc3..525ab3a34c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -79,9 +79,9 @@ namespace mg5amcCpu } #endif // MGONGPU_SUPPORTS_MULTICHANNEL - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index c4c8d93314..d47ee7da4d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -51,15 +51,6 @@ // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s -/* sr -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds ); -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) -sr */ for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; From 5cb61b0688882526c4c5c20185bbcad77888240c Mon Sep 17 00:00:00 2001 From: Stefan Roiser Date: Fri, 21 Nov 2025 16:50:30 +0100 Subject: [PATCH 04/18] regenerate all processes --- .../SubProcesses/P1_epem_mupmum/processConfig.h | 16 ++++++++++++++++ .../P1_Sigma_sm_epem_mupmum/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gg_ttx/processConfig.h | 16 ++++++++++++++++ .../P1_Sigma_sm_gg_ttx/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gg_ttx/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P2_gg_ttxg/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gg_ttxg/processConfig.h | 16 ++++++++++++++++ .../P1_Sigma_sm_gg_ttxg/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gg_ttxgg/processConfig.h | 16 ++++++++++++++++ .../P1_Sigma_sm_gg_ttxgg/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gg_ttxggg/processConfig.h | 16 ++++++++++++++++ .../P1_Sigma_sm_gg_ttxggg/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gu_ttxu/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gux_ttxux/processConfig.h | 16 ++++++++++++++++ .../P1_Sigma_sm_gu_ttxu/processConfig.h | 16 ++++++++++++++++ .../P1_Sigma_sm_gux_ttxux/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gg_bbx/processConfig.h | 16 ++++++++++++++++ .../P1_Sigma_heft_gg_bbx/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P0_dux_ttxwm/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P0_udx_ttxwp/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_dux_ttxwmg/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gd_ttxwmu/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gdx_ttxwpux/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gu_ttxwpd/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gux_ttxwmdx/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_udx_ttxwpg/processConfig.h | 16 ++++++++++++++++ 
.../SubProcesses/P0_gg_ttx/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P0_uux_ttx/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gg_ttxg/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gu_ttxu/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gux_ttxux/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_uux_ttxg/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P2_gg_ttxgg/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P2_gg_ttxuux/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P2_gu_ttxgu/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P2_gux_ttxgux/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P2_uc_ttxuc/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P2_ucx_ttxucx/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P2_uu_ttxuu/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P2_uux_ttxccx/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P2_uux_ttxgg/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P2_uux_ttxuux/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P2_uxcx_ttxuxcx/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P2_uxux_ttxuxux/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gg_ttxttx/processConfig.h | 16 ++++++++++++++++ .../processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gg_t1t1x/processConfig.h | 16 ++++++++++++++++ .../P1_Sigma_MSSM_SLHA2_gg_t1t1x/processConfig.h | 16 ++++++++++++++++ .../SubProcesses/P1_gg_ttx/processConfig.h | 16 ++++++++++++++++ .../P1_Sigma_MSSM_SLHA2_gg_ttx/processConfig.h | 16 ++++++++++++++++ 50 files changed, 800 insertions(+) create mode 100644 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/processConfig.h create mode 100644 epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/processConfig.h create mode 100644 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h create mode 100644 epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/processConfig.h create mode 100644 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/processConfig.h create mode 100644 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/processConfig.h create mode 100644 epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/processConfig.h create mode 100644 epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/processConfig.h create mode 100644 epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/processConfig.h create mode 100644 epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/processConfig.h create mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/processConfig.h create mode 100644 epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/processConfig.h create mode 100644 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/processConfig.h create mode 100644 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/processConfig.h create mode 100644 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/processConfig.h create mode 100644 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/processConfig.h create mode 100644 epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/processConfig.h create mode 100644 epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/processConfig.h create mode 100644 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/processConfig.h create mode 100644 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/processConfig.h create mode 100644 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/processConfig.h create mode 
100644 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/processConfig.h create mode 100644 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/processConfig.h create mode 100644 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/processConfig.h create mode 100644 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/processConfig.h create mode 100644 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/processConfig.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/processConfig.h create mode 100644 epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/processConfig.h create mode 100644 epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/processConfig.h create mode 100644 epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/processConfig.h create mode 100644 epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/processConfig.h create mode 100644 epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/processConfig.h diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/processConfig.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/processConfig.h new file mode 100644 index 0000000000..65cfee8266 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H +#define MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 2; + +} + +#endif // MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/processConfig.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/processConfig.h new file mode 100644 index 0000000000..65cfee8266 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H +#define MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 2; + +} + +#endif // MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h new file mode 100644 index 0000000000..8f6a27f796 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/processConfig.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/processConfig.h new file mode 100644 index 0000000000..8f6a27f796 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/processConfig.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/processConfig.h new file mode 100644 index 0000000000..8f6a27f796 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/processConfig.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/processConfig.h new file mode 100644 index 0000000000..47044dbe6a --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 16; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/processConfig.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/processConfig.h new file mode 100644 index 0000000000..47044dbe6a --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 16; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/processConfig.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/processConfig.h new file mode 100644 index 0000000000..47044dbe6a --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 16; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/processConfig.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/processConfig.h new file mode 100644 index 0000000000..4f4a3c3bc0 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXGG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 123; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/processConfig.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/processConfig.h new file mode 100644 index 0000000000..4f4a3c3bc0 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXGG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 123; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/processConfig.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/processConfig.h new file mode 100644 index 0000000000..fe7af482a7 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 1240; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/processConfig.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/processConfig.h new file mode 100644 index 0000000000..fe7af482a7 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 1240; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/processConfig.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/processConfig.h new file mode 100644 index 0000000000..fe66e4e760 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GU_TTXU_H +#define MG5_CONFIG_SIGMA_SM_GU_TTXU_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GU_TTXU_H \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/processConfig.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/processConfig.h new file mode 100644 index 0000000000..89823b9d1d --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H +#define MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/processConfig.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/processConfig.h new file mode 100644 index 0000000000..fe66e4e760 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GU_TTXU_H +#define MG5_CONFIG_SIGMA_SM_GU_TTXU_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GU_TTXU_H \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/processConfig.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/processConfig.h new file mode 100644 index 0000000000..89823b9d1d --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H +#define MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/processConfig.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/processConfig.h new file mode 100644 index 0000000000..f7dbd383b0 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_HEFT_GG_BBX_H +#define MG5_CONFIG_SIGMA_HEFT_GG_BBX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 4; + +} + +#endif // MG5_CONFIG_SIGMA_HEFT_GG_BBX_H \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/processConfig.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/processConfig.h new file mode 100644 index 0000000000..f7dbd383b0 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_HEFT_GG_BBX_H +#define MG5_CONFIG_SIGMA_HEFT_GG_BBX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 4; + +} + +#endif // MG5_CONFIG_SIGMA_HEFT_GG_BBX_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/processConfig.h new file mode 100644 index 0000000000..4f350b6335 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWM_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWM_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 2; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWM_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/processConfig.h new file mode 100644 index 0000000000..38d2d5ed20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWP_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWP_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 2; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWP_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/processConfig.h new file mode 100644 index 0000000000..743c903011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWMG_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWMG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 12; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWMG_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/processConfig.h new file mode 100644 index 0000000000..0861e7eec7 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_GD_TTXWMU_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_GD_TTXWMU_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 12; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_GD_TTXWMU_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/processConfig.h new file mode 100644 index 0000000000..2e039b079e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_GDX_TTXWPUX_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_GDX_TTXWPUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 12; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_GDX_TTXWPUX_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/processConfig.h new file mode 100644 index 0000000000..8aa1915d04 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_GU_TTXWPD_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_GU_TTXWPD_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 12; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_GU_TTXWPD_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/processConfig.h new file mode 100644 index 0000000000..6724b700f9 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_GUX_TTXWMDX_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_GUX_TTXWMDX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 12; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_GUX_TTXWMDX_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/processConfig.h new file mode 100644 index 0000000000..59ab03987d --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWPG_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWPG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 12; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWPG_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/processConfig.h new file mode 100644 index 0000000000..8f6a27f796 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/processConfig.h new file mode 100644 index 0000000000..771b635b93 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTX_H +#define MG5_CONFIG_SIGMA_SM_UUX_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 1; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UUX_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/processConfig.h new file mode 100644 index 0000000000..47044dbe6a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 16; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXG_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/processConfig.h new file mode 100644 index 0000000000..fe66e4e760 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GU_TTXU_H +#define MG5_CONFIG_SIGMA_SM_GU_TTXU_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GU_TTXU_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/processConfig.h new file mode 100644 index 0000000000..89823b9d1d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H +#define MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/processConfig.h new file mode 100644 index 0000000000..8754e13596 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTXG_H +#define MG5_CONFIG_SIGMA_SM_UUX_TTXG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UUX_TTXG_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/processConfig.h new file mode 100644 index 0000000000..4f4a3c3bc0 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXGG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 123; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGG_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/processConfig.h new file mode 100644 index 0000000000..e51eb2c6c2 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXUUX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXUUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 36; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXUUX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/processConfig.h new file mode 100644 index 0000000000..f7b8795d98 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GU_TTXGU_H +#define MG5_CONFIG_SIGMA_SM_GU_TTXGU_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 36; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GU_TTXGU_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/processConfig.h new file mode 100644 index 0000000000..94a05c400f --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GUX_TTXGUX_H +#define MG5_CONFIG_SIGMA_SM_GUX_TTXGUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 36; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GUX_TTXGUX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/processConfig.h new file mode 100644 index 0000000000..9a17b225fd --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UC_TTXUC_H +#define MG5_CONFIG_SIGMA_SM_UC_TTXUC_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 7; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UC_TTXUC_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/processConfig.h new file mode 100644 index 0000000000..f52e249e91 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_UCX_TTXUCX_H +#define MG5_CONFIG_SIGMA_SM_UCX_TTXUCX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 7; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UCX_TTXUCX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/processConfig.h new file mode 100644 index 0000000000..8f10a6d734 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_UU_TTXUU_H +#define MG5_CONFIG_SIGMA_SM_UU_TTXUU_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 14; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UU_TTXUU_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/processConfig.h new file mode 100644 index 0000000000..2dfae1920f --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTXCCX_H +#define MG5_CONFIG_SIGMA_SM_UUX_TTXCCX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 7; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UUX_TTXCCX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/processConfig.h new file mode 100644 index 0000000000..25081a00da --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTXGG_H +#define MG5_CONFIG_SIGMA_SM_UUX_TTXGG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 36; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UUX_TTXGG_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/processConfig.h new file mode 100644 index 0000000000..e6c319de76 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTXUUX_H +#define MG5_CONFIG_SIGMA_SM_UUX_TTXUUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 14; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UUX_TTXUUX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/processConfig.h new file mode 100644 index 0000000000..7adcbeb7fc --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_UXCX_TTXUXCX_H +#define MG5_CONFIG_SIGMA_SM_UXCX_TTXUXCX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 7; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UXCX_TTXUXCX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/processConfig.h new file mode 100644 index 0000000000..368fc584e6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_UXUX_TTXUXUX_H +#define MG5_CONFIG_SIGMA_SM_UXUX_TTXUXUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 14; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UXUX_TTXUXUX_H \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/processConfig.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/processConfig.h new file mode 100644 index 0000000000..b9f07de180 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H +#define MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 72; + +} + +#endif // MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/processConfig.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/processConfig.h new file mode 100644 index 0000000000..b9f07de180 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H +#define MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 72; + +} + +#endif // MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/processConfig.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/processConfig.h new file mode 100644 index 0000000000..998cb0ade6 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H +#define MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 6; + +} + +#endif // MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/processConfig.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/processConfig.h new file mode 100644 index 0000000000..998cb0ade6 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H +#define MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 6; + +} + +#endif // MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h new file mode 100644 index 0000000000..04a79dca0d --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
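Every per-process processConfig.h above, and the two susy_gg_tt headers completing the set just below, shares one shape: a process-specific include guard plus a single constexpr int ndiagrams in namespace processConfig, holding the diagram count that MemoryBuffers.h uses to size the per-event numerators buffer. A minimal sketch of a compile-time consumer, assuming only that such a header is on the include path; the NumeratorScratch name is illustrative and not part of the generated code:

#include "processConfig.h"

// One running sum per Feynman diagram, sized at compile time from the
// generated diagram count (e.g. 3 for P0_gg_ttx, 123 for P2_gg_ttxgg).
template<typename T>
struct NumeratorScratch
{
  static_assert( processConfig::ndiagrams > 0, "ndiagrams must be positive" );
  T sums[processConfig::ndiagrams] = {}; // zero-initialized accumulators
};

NumeratorScratch<double> scratch; // illustrative usage only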
+ + +#ifndef MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H +#define MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/processConfig.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/processConfig.h new file mode 100644 index 0000000000..04a79dca0d --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H +#define MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H \ No newline at end of file From 8a384f1259b68017321e66e6a6f3d9bfdd86d8bf Mon Sep 17 00:00:00 2001 From: Stefan Roiser Date: Fri, 21 Nov 2025 16:51:14 +0100 Subject: [PATCH 05/18] regenerate all processes --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 65 +- .../ee_mumu.mad/Cards/me5_configuration.txt | 4 +- epochX/cudacpp/ee_mumu.mad/Source/make_opts | 2 +- .../ee_mumu.mad/SubProcesses/MemoryAccessGs.h | 8 + .../ee_mumu.mad/SubProcesses/MemoryBuffers.h | 4 +- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 130 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 38 +- .../ee_mumu.sa/SubProcesses/MemoryAccessGs.h | 8 + .../ee_mumu.sa/SubProcesses/MemoryBuffers.h | 4 +- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 116 +- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 67 +- .../gg_tt.mad/Cards/me5_configuration.txt | 4 +- epochX/cudacpp/gg_tt.mad/Source/make_opts | 2 +- .../gg_tt.mad/SubProcesses/MemoryAccessGs.h | 8 + .../gg_tt.mad/SubProcesses/MemoryBuffers.h | 4 +- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 137 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 39 +- .../gg_tt.sa/SubProcesses/MemoryAccessGs.h | 8 + .../gg_tt.sa/SubProcesses/MemoryBuffers.h | 4 +- .../P1_Sigma_sm_gg_ttx/CPPProcess.cc | 116 +- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 76 +- .../gg_tt01g.mad/Cards/me5_configuration.txt | 4 +- epochX/cudacpp/gg_tt01g.mad/Source/make_opts | 2 +- .../SubProcesses/MemoryAccessGs.h | 8 + .../gg_tt01g.mad/SubProcesses/MemoryBuffers.h | 4 +- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 137 +- .../SubProcesses/P2_gg_ttxg/CPPProcess.cc | 221 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 67 +- .../gg_ttg.mad/Cards/me5_configuration.txt | 4 +- epochX/cudacpp/gg_ttg.mad/Source/make_opts | 2 +- .../gg_ttg.mad/SubProcesses/MemoryAccessGs.h | 8 + .../gg_ttg.mad/SubProcesses/MemoryBuffers.h | 4 +- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 221 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 39 +- .../gg_ttg.sa/SubProcesses/MemoryAccessGs.h | 8 + .../gg_ttg.sa/SubProcesses/MemoryBuffers.h | 4 +- .../P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 116 +- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 67 +- .../gg_ttgg.mad/Cards/me5_configuration.txt | 4 +- epochX/cudacpp/gg_ttgg.mad/Source/make_opts | 2 +- .../gg_ttgg.mad/SubProcesses/MemoryAccessGs.h | 8 + .../gg_ttgg.mad/SubProcesses/MemoryBuffers.h | 4 +- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 851 ++- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 37 +- .../gg_ttgg.sa/SubProcesses/MemoryAccessGs.h | 8 + 
.../gg_ttgg.sa/SubProcesses/MemoryBuffers.h | 4 +- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 116 +- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 69 +- .../gg_ttggg.mad/Cards/me5_configuration.txt | 4 +- epochX/cudacpp/gg_ttggg.mad/Source/make_opts | 2 +- .../SubProcesses/MemoryAccessGs.h | 8 + .../gg_ttggg.mad/SubProcesses/MemoryBuffers.h | 4 +- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 6733 ++++++++++++----- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 39 +- .../gg_ttggg.sa/SubProcesses/MemoryAccessGs.h | 8 + .../gg_ttggg.sa/SubProcesses/MemoryBuffers.h | 4 +- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 116 +- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 73 +- .../gq_ttq.mad/Cards/me5_configuration.txt | 4 +- epochX/cudacpp/gq_ttq.mad/Source/make_opts | 2 +- .../gq_ttq.mad/SubProcesses/MemoryAccessGs.h | 8 + .../gq_ttq.mad/SubProcesses/MemoryBuffers.h | 4 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 151 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 151 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 47 +- .../gq_ttq.sa/SubProcesses/MemoryAccessGs.h | 8 + .../gq_ttq.sa/SubProcesses/MemoryBuffers.h | 4 +- .../P1_Sigma_sm_gu_ttxu/CPPProcess.cc | 116 +- .../P1_Sigma_sm_gux_ttxux/CPPProcess.cc | 116 +- .../CODEGEN_mad_heft_gg_bb_log.txt | 65 +- .../Cards/me5_configuration.txt | 4 +- .../cudacpp/heft_gg_bb.mad/Source/make_opts | 2 +- .../SubProcesses/MemoryAccessGs.h | 8 + .../SubProcesses/MemoryBuffers.h | 4 +- .../SubProcesses/P1_gg_bbx/CPPProcess.cc | 144 +- .../CODEGEN_cudacpp_heft_gg_bb_log.txt | 85 +- .../SubProcesses/MemoryAccessGs.h | 8 + .../SubProcesses/MemoryBuffers.h | 4 +- .../P1_Sigma_heft_gg_bbx/CPPProcess.cc | 116 +- .../CODEGEN_mad_nobm_pp_ttW_log.txt | 111 +- .../Cards/me5_configuration.txt | 4 +- .../cudacpp/nobm_pp_ttW.mad/Source/make_opts | 2 +- .../SubProcesses/MemoryAccessGs.h | 8 + .../SubProcesses/MemoryBuffers.h | 4 +- .../SubProcesses/P0_dux_ttxwm/CPPProcess.cc | 130 +- .../SubProcesses/P0_udx_ttxwp/CPPProcess.cc | 130 +- .../SubProcesses/P1_dux_ttxwmg/CPPProcess.cc | 200 +- .../SubProcesses/P1_gd_ttxwmu/CPPProcess.cc | 200 +- .../SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc | 200 +- .../SubProcesses/P1_gu_ttxwpd/CPPProcess.cc | 200 +- .../SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc | 200 +- .../SubProcesses/P1_udx_ttxwpg/CPPProcess.cc | 200 +- .../CODEGEN_mad_pp_tt012j_log.txt | 173 +- .../pp_tt012j.mad/Cards/me5_configuration.txt | 4 +- epochX/cudacpp/pp_tt012j.mad/Source/make_opts | 2 +- .../SubProcesses/MemoryAccessGs.h | 8 + .../SubProcesses/MemoryBuffers.h | 4 +- .../SubProcesses/P0_gg_ttx/CPPProcess.cc | 137 +- .../SubProcesses/P0_uux_ttx/CPPProcess.cc | 123 +- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 221 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 151 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 151 +- .../SubProcesses/P1_uux_ttxg/CPPProcess.cc | 151 +- .../SubProcesses/P2_gg_ttxgg/CPPProcess.cc | 851 ++- .../SubProcesses/P2_gg_ttxuux/CPPProcess.cc | 361 +- .../SubProcesses/P2_gu_ttxgu/CPPProcess.cc | 361 +- .../SubProcesses/P2_gux_ttxgux/CPPProcess.cc | 361 +- .../SubProcesses/P2_uc_ttxuc/CPPProcess.cc | 165 +- .../SubProcesses/P2_ucx_ttxucx/CPPProcess.cc | 165 +- .../SubProcesses/P2_uu_ttxuu/CPPProcess.cc | 214 +- .../SubProcesses/P2_uux_ttxccx/CPPProcess.cc | 165 +- .../SubProcesses/P2_uux_ttxgg/CPPProcess.cc | 361 +- .../SubProcesses/P2_uux_ttxuux/CPPProcess.cc | 214 +- .../P2_uxcx_ttxuxcx/CPPProcess.cc | 165 +- .../P2_uxux_ttxuxux/CPPProcess.cc | 214 +- .../CODEGEN_mad_smeft_gg_tttt_log.txt | 69 +- .../smeft_gg_tttt.mad/Cards/ident_card.dat | 40 
+- .../Cards/me5_configuration.txt | 4 +- .../smeft_gg_tttt.mad/Source/make_opts | 2 +- .../smeft_gg_tttt.mad/Source/param_card.inc | 40 +- .../SubProcesses/MemoryAccessGs.h | 8 + .../SubProcesses/MemoryBuffers.h | 4 +- .../SubProcesses/P1_gg_ttxttx/CPPProcess.cc | 606 +- .../CODEGEN_cudacpp_smeft_gg_tttt_log.txt | 79 +- .../SubProcesses/MemoryAccessGs.h | 8 + .../SubProcesses/MemoryBuffers.h | 4 +- .../CPPProcess.cc | 116 +- .../CODEGEN_mad_susy_gg_t1t1_log.txt | 65 +- .../susy_gg_t1t1.mad/Cards/ident_card.dat | 2 +- .../Cards/me5_configuration.txt | 4 +- .../cudacpp/susy_gg_t1t1.mad/Source/make_opts | 2 +- .../susy_gg_t1t1.mad/Source/param_card.inc | 2 +- .../SubProcesses/MemoryAccessGs.h | 8 + .../SubProcesses/MemoryBuffers.h | 4 +- .../SubProcesses/P1_gg_t1t1x/CPPProcess.cc | 151 +- .../CODEGEN_cudacpp_susy_gg_t1t1_log.txt | 37 +- .../SubProcesses/MemoryAccessGs.h | 8 + .../SubProcesses/MemoryBuffers.h | 4 +- .../CPPProcess.cc | 116 +- .../CODEGEN_mad_susy_gg_tt_log.txt | 65 +- .../susy_gg_tt.mad/Cards/ident_card.dat | 2 +- .../Cards/me5_configuration.txt | 4 +- .../cudacpp/susy_gg_tt.mad/Source/make_opts | 2 +- .../susy_gg_tt.mad/Source/param_card.inc | 2 +- .../SubProcesses/MemoryAccessGs.h | 8 + .../SubProcesses/MemoryBuffers.h | 4 +- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 137 +- .../CODEGEN_cudacpp_susy_gg_tt_log.txt | 38 +- .../SubProcesses/MemoryAccessGs.h | 8 + .../SubProcesses/MemoryBuffers.h | 4 +- .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc | 116 +- 151 files changed, 12846 insertions(+), 6143 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index b7cdf09c17..7d83fbc2f6 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004445075988769531  +DEBUG: model prefixing takes 0.0018742084503173828  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -149,7 +150,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.003 s +1 processes with 2 diagrams generated in 0.006 s Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -160,10 +161,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vecto INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -175,22 +176,22 @@ FileWriter mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1589]  Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.060 s +Wrote files for 8 helas calls in 1.223 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.170 s +ALOHA: aloha creates 3 routines in 0.120 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.184 s +ALOHA: aloha creates 7 routines in 0.120 s FFV1 FFV1 FFV2 @@ -199,32 +200,34 @@ ALOHA: aloha creates 7 routines in 0.184 s FFV4 FFV2_4 FFV2_4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. 
+FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m2.396s -user 0m1.798s -sys 0m0.425s -Code generation completed in 2 seconds +real 0m6.564s +user 0m1.276s +sys 0m0.649s +Code generation completed in 7 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -245,9 +248,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -274,9 +277,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/Source/make_opts b/epochX/cudacpp/ee_mumu.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 2450ec54f8..7f07184332 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu 
namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -404,8 +471,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV2_4_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -1049,7 +1119,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1079,11 +1148,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1158,38 +1228,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD 
neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1368,11 +1407,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 3c991f09cf..0312305458 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -1,8 +1,8 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode -('WARNING: loading of madgraph too slow!!!', 1.185530662536621) Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT ************************************************************ * * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004302024841308594  +DEBUG: model prefixing takes 0.0028181076049804688  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,13 +150,13 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
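The sigmaKin hunk above replaces the old single pre-summed numerator with an explicit lookup by channel: the matrix element is rescaled by the numerator of the selected diagram (channelId is 1-based) over the shared denominator. A minimal scalar sketch of that final rescaling, assuming fptype is double and no SIMD; the function name is illustrative, not part of the patch:

// Rescale one event's matrix element by the single-diagram enhancement
// factor, as done at the end of sigmaKin above.
inline double applyChannelEnhancement( double me,
                                       const double* numerators, // one entry per diagram
                                       double denominator,
                                       unsigned int channelId )  // 1-based; 0 disables
{
  if( channelId == 0 ) return me;                      // multichannel disabled
  return me * numerators[channelId - 1] / denominator; // enhance selected diagram
}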
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.003 s +1 processes with 2 diagrams generated in 0.006 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -165,17 +165,17 @@ INFO: Processing color information for process: e+ e- > mu+ mu- @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
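As shown in the ee_mumu.mad CPPProcess.cc hunks further above, each diagram now accumulates into its own numerator slot whenever multichannel is enabled, instead of only when its own channel is selected; the denominator still sums over every diagram. A minimal scalar sketch of that bookkeeping for one diagram, with a local cxabs2 standing in for the |amplitude|^2 helper used in the plugin:

#include <complex>

inline double cxabs2( const std::complex<double>& c ) { return std::norm( c ); } // |c|^2

// Per-diagram accumulation after computing amplitude 'amp' for diagram
// idiag (0-based), mirroring the 'if( channelId != 0 )' blocks above.
inline void accumulate( double* numerators, double& denominator,
                        int idiag, const std::complex<double>& amp,
                        unsigned int channelId )
{
  if( channelId != 0 ) // multichannel enabled for this event
  {
    numerators[idiag] += cxabs2( amp ); // per-diagram slot (was: only if channelId matched)
    denominator += cxabs2( amp );       // shared denominator (behavior unchanged)
  }
}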
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. +Generated helas calls for 1 subprocesses (2 diagrams) in 0.005 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.190 s +ALOHA: aloha creates 4 routines in 0.116 s FFV1 FFV1 FFV2 @@ -184,17 +184,17 @@ ALOHA: aloha creates 4 routines in 0.190 s FFV4 FFV2_4 FFV2_4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
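The new kernelAccessP accessor in MemoryAccessGs.h (shown above for ee_mumu.mad and repeated below for ee_mumu.sa) returns a pointer rather than a reference, so the numerators buffer can be indexed one SIMD page per diagram. A minimal sketch of the reinterpretation and the reset loop that uses it, under the simplifying assumption that fptype is double and fptype_sv is a fixed 4-wide page; the real types come from the plugin's vector headers, and the ndiagrams value here is hypothetical:

typedef double fptype;
struct fptype_sv { fptype v[4]; }; // stand-in for one neppV-wide SIMD page

// View a flat per-event buffer as an array of SIMD pages, one per diagram.
inline fptype_sv* kernelAccessP( fptype* buffer )
{
  return reinterpret_cast<fptype_sv*>( buffer );
}

constexpr int ndiagrams = 2; // e.g. processConfig::ndiagrams for ee_mumu
inline void resetNumerators( fptype* numerators )
{
  fptype_sv* numerators_sv = kernelAccessP( numerators );
  for( int i = 0; i < ndiagrams; ++i )
    numerators_sv[i] = fptype_sv{}; // zero one SIMD page per diagram
}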
quit -real 0m1.709s -user 0m1.562s -sys 0m0.115s +real 0m1.506s +user 0m0.432s +sys 0m0.135s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index af61f3ea74..e030cc38c8 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -1047,7 +1111,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1077,11 +1140,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1156,38 +1220,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1366,11 +1399,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 156f7ce8e7..f31e287e32 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. 
This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004584789276123047  +DEBUG: model prefixing takes 0.0018684864044189453  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.007 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -161,10 +162,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_ INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -176,49 +177,51 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.009 s -Wrote files for 10 helas calls in 0.078 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 
3: 3} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s +Wrote files for 10 helas calls in 0.925 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.103 s +ALOHA: aloha creates 2 routines in 0.098 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.088 s +ALOHA: aloha creates 4 routines in 0.082 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. 
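Note on the numerator buffer change in the hunks above: sizePerEventNumerators grows from 1 to processConfig::ndiagrams, so the numerators buffer now holds one running sum per Feynman diagram for every event, and kernelAccessP reinterprets each per-event record as an array of fptype_sv values (one slot per diagram) rather than a single scalar reference. Below is a minimal standalone sketch of that layout under simplifying assumptions: a flat scalar buffer instead of the real SIMD/AOSOA accessors, and illustrative names nevt and ievt0. Only ndiagrams = 3 is taken from this output (the 3-diagram g g > t t~ process above).

    // Minimal standalone sketch (plain scalar C++, NOT the real SIMD/AOSOA
    // accessors): one numerator slot per diagram and per event.
    #include <cassert>
    #include <vector>

    namespace processConfig { constexpr int ndiagrams = 3; } // as generated for g g > t t~

    int main()
    {
      const int nevt = 4; // illustrative number of events
      // Buffer size now scales with ndiagrams (the old sizePerEventNumerators was 1).
      std::vector<double> allNumerators( nevt * processConfig::ndiagrams, 0. );
      // The record for event ievt0 starts at ievt0 * ndiagrams, mirroring the
      // ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ) calls.
      const int ievt0 = 2;
      double* numerators = &allNumerators[ievt0 * processConfig::ndiagrams];
      numerators[0] += 1.5; // accumulate |amp|^2 of diagram 1 for this event
      assert( allNumerators[ievt0 * processConfig::ndiagrams] == 1.5 );
      return 0;
    }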
quit -real 0m2.028s -user 0m1.664s -sys 0m0.358s -Code generation completed in 2 seconds +real 0m5.871s +user 0m1.183s +sys 0m0.587s +Code generation completed in 6 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -239,9 +242,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -268,9 +271,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/Source/make_opts b/epochX/cudacpp/gg_tt.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 55167ebaf3..65712c3058 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -405,8 +472,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -418,8 +488,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -1062,7 +1135,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1092,11 +1164,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( 
numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1171,38 +1244,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1381,11 +1423,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 1f90d3c408..3410a9d9e8 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004430294036865234  +DEBUG: model prefixing takes 0.0018236637115478516  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,13 +151,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.006 s +1 processes with 3 diagrams generated in 0.007 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -165,30 +166,30 @@ INFO: Processing color information for process: g g > t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
-Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. +Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.121 s +ALOHA: aloha creates 2 routines in 0.060 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
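The getChannelId helper factored out in the CPPProcess.cc hunks above centralizes the multichannel sanity checks: on the host SIMD path it asserts (#898) that every lane of a neppV event page carries the same channelId, and in mixed precision it optionally re-checks the second neppV page (#924); that optional check is skipped at the final enhancement call via sanityCheckMixedPrecision = false. A standalone sketch of the lane-uniformity check follows, with a plain array standing in for uint_v and an illustrative neppV = 4.

    // Standalone sketch of the SANITY CHECK #898 logic inside getChannelId:
    // all neppV lanes of a SIMD event page must carry the same channelId.
    #include <cassert>

    constexpr int neppV = 4; // illustrative SIMD page width

    unsigned int uniformChannelId( const unsigned int channelIds_v[neppV] )
    {
      unsigned int channelId = channelIds_v[0]; // element[0]
      for( int i = 1; i < neppV; ++i )          // elements[1...neppV-1]
        assert( channelId == channelIds_v[i] ); // all lanes must agree
      assert( channelId > 0 );                  // multichannel enabled => id > 0
      return channelId;
    }

    int main()
    {
      const unsigned int page[neppV] = { 2, 2, 2, 2 };
      return uniformChannelId( page ) == 2 ? 0 : 1;
    }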
quit -real 0m0.508s -user 0m0.439s -sys 0m0.064s -Code generation completed in 1 seconds +real 0m1.286s +user 0m0.361s +sys 0m0.100s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 2b06bb84d0..59e7d2f86c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -1059,7 +1123,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1089,11 +1152,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1168,38 +1232,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1378,11 +1411,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 0af9646028..eb37f81cd3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -1,8 +1,8 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. 
This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode -('WARNING: loading of madgraph too slow!!!', 0.5061478614807129) Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT ************************************************************ * * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.01866316795349121  +DEBUG: model prefixing takes 0.0017805099487304688  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.010 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -159,7 +159,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.023 s +1 processes with 16 diagrams generated in 0.017 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -170,10 +170,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vect INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @2 INFO: Processing color information for process: g g > t t~ g @2 @@ -187,9 +187,9 @@ FileWriter t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
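With one numerator slot per diagram, the per-event reset shown in the CPPProcess.cc hunks earlier in this patch becomes a loop over processConfig::ndiagrams entries instead of a single assignment, while the denominator keeps its single running sum. A sketch under stated assumptions: plain doubles stand in for fptype_sv, and ndiagrams = 16 follows the 16-diagram g g > t t~ g subprocess reported in this log.

    // Sketch of the new per-event reset: one zero per diagram replaces the
    // old single "numerators_sv = fptype_sv{ 0 }".
    #include <vector>

    int main()
    {
      constexpr int ndiagrams = 16; // as reported for g g > t t~ g above
      std::vector<double> numerators_sv( ndiagrams, -1. ); // pretend stale values
      double denominators_sv = -1.;
      for( int i = 0; i < ndiagrams; ++i )
        numerators_sv[i] = 0.; // one reset per diagram
      denominators_sv = 0.;    // the denominator keeps a single running sum
      return ( denominators_sv == 0. && numerators_sv[0] == 0. ) ? 0 : 1;
    }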
@@ -198,25 +198,25 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  -Generated helas calls for 2 subprocesses (19 diagrams) in 0.088 s -Wrote files for 46 helas calls in 0.403 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1589]  +Generated helas calls for 2 subprocesses (19 diagrams) in 0.021 s +Wrote files for 46 helas calls in 2.631 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.419 s +ALOHA: aloha creates 5 routines in 0.182 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.553 s +ALOHA: aloha creates 10 routines in 0.137 s VVV1 VVV1 FFV1 @@ -226,32 +226,34 @@ ALOHA: aloha creates 10 routines in 0.553 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m5.986s -user 0m4.846s -sys 0m0.948s -Code generation completed in 6 seconds +real 0m8.102s +user 0m1.532s +sys 0m0.740s +Code generation completed in 9 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -272,9 +274,9 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -301,9 +303,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace
mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 55167ebaf3..65712c3058 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId!
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -405,8 +472,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -418,8 +488,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -1062,7 +1135,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1092,11 +1164,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; 
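+      // NB: the numerators buffer now holds one fptype_sv slot per diagram (processConfig::ndiagrams slots per
+      // SIMD event page, reached via 'ievt0 * processConfig::ndiagrams' above), hence the loop resetting every
+      // slot, while the denominator remains a single running sum of |amp|^2 over all diagrams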
#endif } @@ -1171,38 +1244,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1381,11 +1423,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index f6e0894592..fa3d841089 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -394,8 +458,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; jamp_sv[2] += amp_sv[0]; @@ -410,8 +477,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -424,8 +494,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -439,8 +512,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -452,8 +528,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -466,8 +545,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 
1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -480,8 +562,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; @@ -493,8 +578,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -507,8 +595,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; @@ -520,8 +611,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -534,8 +628,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -548,8 +645,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -564,8 +664,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + 
{ + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -577,8 +680,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -590,8 +696,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -1279,7 +1388,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1309,11 +1417,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1388,38 +1497,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1598,11 +1676,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index e50d05daa6..9f1c215d46 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. 
This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004053354263305664  +DEBUG: model prefixing takes 0.0018467903137207031  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.016 s +1 processes with 16 diagrams generated in 0.020 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -161,10 +162,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -176,25 +177,25 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  -Generated helas calls for 1 
subprocesses (16 diagrams) in 0.030 s -Wrote files for 36 helas calls in 0.096 s +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.015 s +Wrote files for 36 helas calls in 1.413 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.242 s +ALOHA: aloha creates 5 routines in 0.178 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.216 s +ALOHA: aloha creates 10 routines in 0.134 s VVV1 VVV1 FFV1 @@ -204,32 +205,34 @@ ALOHA: aloha creates 10 routines in 0.216 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m2.399s -user 0m2.037s -sys 0m0.357s -Code generation completed in 3 seconds +real 0m6.988s +user 0m1.465s +sys 0m0.615s +Code generation completed in 8 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -250,9 +253,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -279,9 +282,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/Source/make_opts b/epochX/cudacpp/gg_ttg.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttg.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype>
BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 7f29af7755..c2f3ee7141 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -394,8 +458,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; jamp_sv[2] += amp_sv[0]; @@ -410,8 +477,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -424,8 +494,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -439,8 +512,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -452,8 +528,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -466,8 +545,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 
1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -480,8 +562,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; @@ -493,8 +578,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -507,8 +595,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; @@ -520,8 +611,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -534,8 +628,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -548,8 +645,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -564,8 +664,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + 
{ + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -577,8 +680,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -590,8 +696,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -1279,7 +1388,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1309,11 +1417,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1388,38 +1497,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1598,11 +1676,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index ab60b4e5bd..8002f4efbd 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. 
This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0042188167572021484  +DEBUG: model prefixing takes 0.0017540454864501953  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,13 +151,13 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.017 s +1 processes with 16 diagrams generated in 0.025 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -165,18 +166,18 @@ INFO: Processing color information for process: g g > t t~ g @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 
'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.029 s +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. +Generated helas calls for 1 subprocesses (16 diagrams) in 0.015 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.230 s +ALOHA: aloha creates 5 routines in 0.142 s VVV1 VVV1 FFV1 @@ -186,17 +187,17 @@ ALOHA: aloha creates 5 routines in 0.230 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
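Note on the change in this patch: the numerators buffer now holds one entry per diagram per event (sizePerEventNumerators = processConfig::ndiagrams) instead of a single shared entry, and the single-diagram enhancement at the end of sigmaKin reads back the slot of the selected channel. A minimal standalone C++ sketch of this bookkeeping follows; names and layout are illustrative only (the flat std::vector layout and 'MultiChannelSketch' are assumptions, not the generated SoA buffers).

#include <cassert>
#include <vector>

// Sketch: one numerator per diagram per event (was: one per event),
// plus one shared denominator per event.
struct MultiChannelSketch
{
  int nevt;                         // number of events
  int ndiagrams;                    // stands in for processConfig::ndiagrams
  std::vector<double> numerators;   // flat [nevt * ndiagrams], zero-initialised
  std::vector<double> denominators; // [nevt], zero-initialised
  MultiChannelSketch( int nev, int ndiag )
    : nevt( nev ), ndiagrams( ndiag ), numerators( nev * ndiag, 0. ), denominators( nev, 0. ) {}
  // Per-diagram accumulation (the new 'if( channelId != 0 )' blocks):
  // each diagram feeds its own numerator slot; all diagrams feed the denominator.
  void accumulate( int ievt, int idiag, double amp2 )
  {
    numerators[ievt * ndiagrams + idiag] += amp2;
    denominators[ievt] += amp2;
  }
  // Single-diagram enhancement (the new 'MEs_sv *= numerators_sv[channelId - 1] / denominators_sv'):
  // the selected channel reads back its own diagram's numerator.
  double enhance( int ievt, unsigned int channelId, double me ) const
  {
    assert( channelId >= 1 && channelId <= static_cast<unsigned int>( ndiagrams ) );
    return me * numerators[ievt * ndiagrams + ( channelId - 1 )] / denominators[ievt];
  }
};

Keeping all per-diagram sums means a channelId that is only known (or only checked) late in sigmaKin can still select its numerator without recomputing any amplitudes.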
quit
-real 0m0.642s
-user 0m0.586s
-sys 0m0.050s
-Code generation completed in 1 seconds
+real 0m1.685s
+user 0m0.477s
+sys 0m0.156s
+Code generation completed in 2 seconds
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }

+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_sm.h"
+#include "processConfig.h"

 #include <memory>

@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;

   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;

 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc
index 3897ffd9b4..98bb331cee 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"

 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks!
#898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -1273,7 +1337,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1303,11 +1366,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1382,38 +1446,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1592,11 +1625,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 8c941153c6..951f0e3b1d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. 
This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004433155059814453  +DEBUG: model prefixing takes 0.0018601417541503906  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.125 s +1 processes with 123 diagrams generated in 0.066 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -161,10 +162,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vecto INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -176,25 +177,25 @@ FileWriter t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 
41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.307 s -Wrote files for 222 helas calls in 0.475 s +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.186 s +Wrote 
files for 222 helas calls in 1.983 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.280 s +ALOHA: aloha creates 5 routines in 0.144 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.246 s +ALOHA: aloha creates 10 routines in 0.152 s VVV1 VVV1 FFV1 @@ -207,32 +208,34 @@ ALOHA: aloha creates 10 routines in 0.246 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. 
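The getChannelId helper factored out in these files centralizes the #898/#924 invariant: all events in one SIMD page (and, in mixed precision, in both neppV pages) must carry the same channelId. A minimal sketch of that check follows, assuming a plain per-event array; neppV = 4 is an illustrative width, not the build-time value.

#include <cassert>

constexpr int neppV = 4; // illustrative SIMD width (assumption, not the build-time value)

// Return the single channelId shared by one SIMD page of events,
// asserting the #898 invariant that all events in the page agree.
inline unsigned int pageChannelId( const unsigned int* channelIds ) // channelIds[neppV]
{
  const unsigned int channelId = channelIds[0];
  for( int i = 1; i < neppV; ++i )
    assert( channelId == channelIds[i] ); // all events in one SIMD page share one channel
  assert( channelId > 0 ); // channelId == 0 means multichannel is disabled upstream
  return channelId;
}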
quit -real 0m3.426s -user 0m3.041s -sys 0m0.376s -Code generation completed in 4 seconds +real 0m7.349s +user 0m1.941s +sys 0m0.599s +Code generation completed in 8 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -253,9 +256,9 @@ Code generation completed in 4 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -282,9 +285,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt
treatcards param
diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt
index 97e103a317..a0212bfb62 100644
--- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt
+++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt
@@ -235,7 +235,7 @@
 # pineappl = pineappl

-#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo
+#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo

 # MG5 MAIN DIRECTORY
-#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo
+#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo
diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts
index f10336e42e..74463b32eb 100644
--- a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts
+++ b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts
@@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check
 MACFLAG=
 MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime
 PYTHIA8_PATH=NotInstalled
-STDLIB_FLAG=
 STDLIB=-lstdc++
+STDLIB_FLAG=
 #end_of_make_opts_variables

 BIASLIBDIR=../../../lib/
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }

+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_sm.h"
+#include "processConfig.h"

 #include <memory>

@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;

   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;

 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc
index 6664e7c6fc..8c0e33696c 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"

 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace
mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -430,8 +494,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -450,8 +517,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -470,8 +540,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -491,8 +564,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -505,8 +581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; @@ -521,8 +600,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -535,8 +617,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -549,8 +634,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -565,8 +653,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -579,8 +670,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -593,8 +687,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -609,8 +706,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[12] += cxabs2( 
amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -623,8 +723,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -639,8 +742,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -655,8 +761,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[17] -= amp_sv[0]; @@ -673,8 +782,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; @@ -686,8 +798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; @@ -699,8 +814,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -714,8 +832,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -730,8 +851,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 
       FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[20] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -744,8 +868,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 22
       FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[21] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -758,8 +885,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 23
       VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[22] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[2] -= amp_sv[0];
@@ -774,8 +904,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 24
       FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[23] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -788,8 +921,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 25
       FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[24] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -802,8 +938,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 26
       FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[25] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= amp_sv[0];
 
@@ -815,8 +954,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 27
       FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[26] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= amp_sv[0];
 
@@ -828,8 +970,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 28
       FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[27] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= amp_sv[0];
 
@@ -841,8 +986,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 29
       FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[28] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= amp_sv[0];
 
@@ -854,8 +1002,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 30
       FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[29] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -868,8 +1019,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 31
       VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[30] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[1] -= amp_sv[0];
@@ -910,8 +1064,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 33
       FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[32] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] -= amp_sv[0];
 
@@ -923,8 +1080,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 34
       FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[33] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] -= amp_sv[0];
 
@@ -936,8 +1096,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 35
       FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[34] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -950,8 +1113,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 36
       VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[35] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += amp_sv[0];
       jamp_sv[15] -= amp_sv[0];
@@ -966,8 +1132,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 37
       FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[36] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -980,8 +1149,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 38
       FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[37] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -994,8 +1166,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 39
       VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[38] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += amp_sv[0];
       jamp_sv[15] -= amp_sv[0];
@@ -1010,8 +1185,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 40
       FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[39] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1024,8 +1202,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 41
       FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[40] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1038,8 +1219,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 42
       FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[41] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[17] -= amp_sv[0];
 
@@ -1051,8 +1235,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 43
       FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[42] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[15] -= amp_sv[0];
 
@@ -1064,8 +1251,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 44
       FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[43] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[23] -= amp_sv[0];
 
@@ -1077,8 +1267,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 45
       FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[44] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] -= amp_sv[0];
 
@@ -1090,8 +1283,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 46
       FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[45] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1104,8 +1300,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 47
       VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[46] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += amp_sv[0];
       jamp_sv[11] -= amp_sv[0];
@@ -1143,8 +1342,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 49
       FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[48] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1157,8 +1359,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 50
       FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[49] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += amp_sv[0];
       jamp_sv[8] -= amp_sv[0];
@@ -1173,8 +1378,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 51
       FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[50] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1187,8 +1395,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 52
       FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[51] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1201,8 +1412,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 53
       FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[52] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -1217,8 +1431,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 54
       FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[53] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1231,8 +1448,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 55
       FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[54] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[3] -= amp_sv[0];
@@ -1247,8 +1467,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 56
       FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[55] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += amp_sv[0];
       jamp_sv[11] -= amp_sv[0];
@@ -1263,8 +1486,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 57
       VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[56] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1317,8 +1543,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 59
       VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[58] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1337,8 +1566,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 60
       VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[59] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1357,8 +1589,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 61
       FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[60] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -1373,8 +1608,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 62
       FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[61] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1387,8 +1625,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 63
       FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[62] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -1403,8 +1644,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 64
       FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[63] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1418,8 +1662,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 65
       FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[64] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1432,8 +1679,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 66
       FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[65] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] += amp_sv[0];
       jamp_sv[8] -= amp_sv[0];
@@ -1448,8 +1698,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 67
       FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[66] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1462,8 +1715,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 68
       FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[67] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1476,8 +1732,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 69
       FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[68] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -1492,8 +1751,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 70
       FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[69] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1506,8 +1768,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 71
       FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[70] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += amp_sv[0];
       jamp_sv[5] -= amp_sv[0];
@@ -1522,8 +1787,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 72
       FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[71] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += amp_sv[0];
       jamp_sv[9] -= amp_sv[0];
@@ -1538,8 +1806,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 73
       VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[72] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1592,8 +1863,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 75
       VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[74] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1612,8 +1886,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 76
       VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[75] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1632,8 +1909,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 77
       FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[76] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -1648,8 +1928,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 78
       FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[77] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1662,8 +1945,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 79
       FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[78] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -1678,8 +1964,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 80
       FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[79] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1692,8 +1981,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 81
       FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[80] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] -= amp_sv[0];
 
@@ -1705,8 +1997,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 82
       FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[81] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] -= amp_sv[0];
 
@@ -1718,8 +2013,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 83
       FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[82] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] -= amp_sv[0];
 
@@ -1731,8 +2029,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 84
       FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[83] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= amp_sv[0];
 
@@ -1744,8 +2045,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 85
       FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[84] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1758,8 +2062,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 86
       FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[85] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -1774,8 +2081,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 87
       FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[86] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= amp_sv[0];
 
@@ -1787,8 +2097,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 88
       FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[87] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] -= amp_sv[0];
 
@@ -1800,8 +2113,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 89
       FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[88] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] -= amp_sv[0];
 
@@ -1813,8 +2129,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 90
       FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[89] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] -= amp_sv[0];
 
@@ -1826,8 +2145,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 91
       FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[90] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1840,8 +2162,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 92
       FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[91] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += amp_sv[0];
       jamp_sv[5] -= amp_sv[0];
@@ -1890,8 +2215,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 94
       VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[93] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1910,8 +2238,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 95
       VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[94] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1930,8 +2261,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 96
       FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[95] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[19] -= amp_sv[0];
@@ -1946,8 +2280,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 97
       FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[96] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1960,8 +2297,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 98
       FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[97] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[2] -= amp_sv[0];
@@ -1976,8 +2316,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 99
       FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[98] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2024,8 +2367,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 101
       VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[100] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2044,8 +2390,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 102
       VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[101] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2064,8 +2413,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 103
       FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[102] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -2080,8 +2432,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 104
       FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[103] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2094,8 +2449,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 105
       FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[104] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[4] -= amp_sv[0];
@@ -2110,8 +2468,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 106
       FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[105] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2158,8 +2519,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 108
       VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[107] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2178,8 +2542,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 109
       VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[108] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2198,8 +2565,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 110
       FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[109] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] -= amp_sv[0];
 
@@ -2211,8 +2581,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 111
       FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[110] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] -= amp_sv[0];
 
@@ -2224,8 +2597,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 112
       FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[111] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] -= amp_sv[0];
 
@@ -2237,8 +2613,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 113
       FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[112] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] -= amp_sv[0];
 
@@ -3207,7 +3586,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
 
@@ -3237,11 +3615,12 @@ namespace mg5amcCpu
     fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
     MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-    numerators_sv = fptype_sv{ 0 };
+    for( int i = 0; i < processConfig::ndiagrams; ++i )
+      numerators_sv[i] = fptype_sv{ 0 };
     denominators_sv = fptype_sv{ 0 };
 #endif
   }
@@ -3316,38 +3695,7 @@ namespace mg5amcCpu
     const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-    // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-    if( allChannelIds != nullptr )
-    {
-      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-      // NB: channelIds_sv is a scalar in no-SIMD C++
-      channelId = channelIds_sv;
-#else
-      // NB: channelIds_sv is a vector in SIMD C++
-      channelId = channelIds_sv[0]; // element[0]
-      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-      {
-        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-      }
-#endif
-      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-      const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-      uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      for( int i = 0; i < neppV; ++i )
-      {
-        assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-      }
-#endif
-    }
+    unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
     // Running sum of partial amplitudes squared for event by event color selection (#402)
     // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -3526,11 +3874,12 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      MEs_sv *= numerators_sv / denominators_sv;
+      MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
     }
 #endif
     //for( int ieppV = 0; ieppV < neppV; ieppV++ )
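The hunks above switch the multichannel bookkeeping from a single running numerator per event to one numerator per diagram per event: diagram N now accumulates cxabs2( amp_sv[0] ) into numerators_sv[N-1], and the single-diagram enhancement at the end selects the entry for the chosen channel. A minimal standalone sketch of this layout and of the enhancement step (hypothetical names and a hypothetical ndiagrams = 3; the generated code uses the cudacpp accessors shown above):

// Sketch (not part of the patch): per-diagram numerator layout and channel enhancement.
#include <cassert>
#include <vector>

using fptype = double;
constexpr int ndiagrams = 3; // stands in for processConfig::ndiagrams

// numerators[ievt * ndiagrams + idiag] plays the role of the per-event record
// returned by NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams )
fptype enhanceME( fptype me, const std::vector<fptype>& numerators, fptype denominator, int ievt, unsigned int channelId )
{
  assert( channelId >= 1 && channelId <= ndiagrams ); // channelId is 1-based, as in the diagram hunks above
  const fptype* num = &numerators[ievt * ndiagrams];  // per-event record of ndiagrams entries
  return me * num[channelId - 1] / denominator;       // cf. MEs_sv *= numerators_sv[channelId - 1] / denominators_sv
}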
diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
index 691a9d08c7..ca2ea3a480 100644
--- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
+++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version.
 This version is intended for development/beta testing and NOT for production.
 This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,7 +49,7 @@
 Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -57,7 +58,7 @@ generate g g > t t~ g g
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.004384040832519531 
+DEBUG: model prefixing takes 0.0019006729125976562 
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -150,13 +151,13 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED
 INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1
 INFO: Process has 123 diagrams
-1 processes with 123 diagrams generated in 0.118 s
+1 processes with 123 diagrams generated in 0.086 s
 Total: 1 processes with 123 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175] 
 DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180] 
-INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg
+INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg
 INFO: Organizing processes into subprocess groups
 INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1
 INFO: Processing color information for process: g g > t t~ g g @1
@@ -165,18 +166,18 @@ INFO: Processing color information for process: g g > t t~ g g @1
 DEBUG: type(fortran_model)= [output.py at line 224] 
 DEBUG: type(me)= me=0 [output.py at line 225] 
 DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226] 
-INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg
-FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h
-FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc
-INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/.
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.366 s
+INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg
+FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h
+FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc
+INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/.
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.192 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in 0.231 s
+ALOHA: aloha creates 5 routines in 0.157 s
 VVV1
 VVV1
 FFV1
 FFV1
 FFV1
 FFV1
 FFV1
 VVVV1
 VVVV1
 VVVV3
 VVVV4
 VVVV4
-FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/.
+FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/.
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h
-FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc
+FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h
+FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
-INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/.
+INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/.
 quit
-real	0m1.208s
-user	0m1.150s
-sys	0m0.049s
+real	0m2.034s
+user	0m0.812s
+sys	0m0.125s
 Code generation completed in 2 seconds
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }
 
+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_sm.h"
+#include "processConfig.h"
 
 #include 
 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;
 
   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
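kernelAccessP above deliberately returns a pointer (fptype_sv*) rather than a reference, so that callers can index the ndiagrams consecutive SIMD vectors that now make up one event record. A minimal standalone sketch of the same reinterpretation and of the reset loop used in the CPPProcess.cc hunks below (hypothetical 4-wide vector type standing in for fptype_sv):

// Sketch (not part of the patch): pointer-returning kernel access for per-diagram buffers.
using fptype = double;
typedef fptype fptype_v __attribute__( ( vector_size( 4 * sizeof( fptype ) ) ) ); // hypothetical stand-in for fptype_sv

inline fptype_v* kernelAccessP( fptype* buffer )
{
  return reinterpret_cast<fptype_v*>( buffer ); // one fptype_v per diagram, laid out contiguously
}

inline void zeroNumerators( fptype* buffer, int ndiagrams ) // cf. the per-diagram reset loop below
{
  fptype_v* numerators_sv = kernelAccessP( buffer );
  for( int i = 0; i < ndiagrams; ++i ) numerators_sv[i] = fptype_v{}; // zero each diagram's vector
}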
#898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -3264,7 +3328,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -3294,11 +3357,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -3373,38 +3437,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -3583,11 +3616,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 5908592d13..47e89cf5f5 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. 
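The net effect of the CPPProcess.cc changes above is that the numerators buffer now holds one entry per diagram per event rather than a single entry per event: the event record is located at an offset of ievt0 * processConfig::ndiagrams, kernelAccessP returns a pointer to the per-diagram array (instead of a reference to a single fptype_sv), and the multichannel single-diagram enhancement selects the numerator of the chosen channel via numerators_sv[channelId - 1] (channelId is 1-based). A minimal standalone sketch of this layout and selection logic follows; plain scalars stand in for the SIMD-aware fptype_sv, and an illustrative ndiagrams value replaces the generated processConfig::ndiagrams:

#include <cassert>
#include <vector>

using fptype = double; // stand-in for the plugin's floating-point type

constexpr int ndiagrams = 3; // stand-in for the generated processConfig::ndiagrams

// New layout: ndiagrams numerators per event (the old layout had just one),
// hence the "ievt0 * processConfig::ndiagrams" offsets in the patch above.
fptype* numeratorsForEvent( std::vector<fptype>& allNumerators, int ievt )
{
  return allNumerators.data() + ievt * ndiagrams;
}

// Single-diagram enhancement: scale the ME by the selected channel's numerator
// over the common denominator (channelId is 1-based, hence the "- 1").
fptype enhance( fptype me, const fptype* numerators, fptype denominator, unsigned int channelId )
{
  assert( channelId >= 1 && channelId <= static_cast<unsigned int>( ndiagrams ) );
  return me * numerators[channelId - 1] / denominator;
}

int main()
{
  const int nevt = 2;
  std::vector<fptype> allNumerators( nevt * ndiagrams, 0. ); // zero-initialised, as in sigmaKin
  fptype* num0 = numeratorsForEvent( allNumerators, 0 );
  for( int i = 0; i < ndiagrams; ++i ) num0[i] = 1. + i; // per-diagram contributions
  const fptype me = enhance( 10., num0, /*denominator*/ 6., /*channelId*/ 2 );
  assert( me > 0. );
  return 0;
}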
This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0061588287353515625  +DEBUG: model prefixing takes 0.0018525123596191406  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.427 s +1 processes with 1240 diagrams generated in 0.699 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -161,16 +162,16 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vect INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 3s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h @@ -178,25 +179,25 @@ FileWriter t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 
405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 
705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 
119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 
551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 
830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 5.574 s -Wrote files for 2281 helas calls in 17.935 s +DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 
225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 
674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 
928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 
406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 
705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 2.548 s +Wrote files for 2281 helas calls in 25.269 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.379 s +ALOHA: aloha creates 5 routines in 0.176 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.232 s +ALOHA: aloha creates 10 routines in 0.172 s VVV1 VVV1 FFV1 @@ -209,32 +210,34 @@ ALOHA: aloha creates 10 routines in 0.232 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m31.040s -user 0m30.219s -sys 0m0.591s -Code generation completed in 31 seconds +real 0m36.059s +user 0m13.905s +sys 0m1.219s +Code generation completed in 36 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -255,9 +258,9 @@ Code generation completed in 31 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -284,9 +287,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 85e7f8f09c..f9ae4dcde8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace 
mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +464,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; @@ -428,8 +495,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; jamp_sv[24] += amp_sv[0]; @@ -515,8 +585,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; @@ -543,8 +616,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; @@ -629,8 
+705,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; @@ -657,8 +736,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; @@ -981,8 +1063,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -1009,8 +1094,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -1037,8 +1125,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -1123,8 +1214,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -1151,8 +1245,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -1179,8 +1276,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; @@ -1265,8 +1365,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -1293,8 +1396,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -1321,8 +1427,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -1411,8 +1520,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1425,8 +1537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1439,8 +1554,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1459,8 +1577,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += amp_sv[0]; jamp_sv[68] -= amp_sv[0]; @@ -1475,8 +1596,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1495,8 +1619,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += amp_sv[0]; jamp_sv[62] -= amp_sv[0]; @@ -1546,8 +1673,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1560,8 +1690,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 34 FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[33] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1574,8 +1707,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += amp_sv[0]; jamp_sv[55] -= amp_sv[0]; @@ -1590,8 +1726,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1604,8 +1743,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 37 FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + 
numerators_sv[36] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1618,8 +1760,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 38 FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[37] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += amp_sv[0]; jamp_sv[54] -= amp_sv[0]; @@ -1634,8 +1779,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 39 FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[38] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += amp_sv[0]; jamp_sv[49] -= amp_sv[0]; @@ -1650,8 +1798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 40 FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[39] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[64] += amp_sv[0]; jamp_sv[65] -= amp_sv[0]; @@ -1666,8 +1817,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 41 FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[40] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; @@ -1687,8 +1841,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 42 FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[41] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1701,8 +1858,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 43 FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[42] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1715,8 +1875,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 44 VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[43] += cxabs2( amp_sv[0] ); + 
denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1735,8 +1898,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 45 FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[44] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; @@ -1751,8 +1917,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 46 VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[45] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1771,8 +1940,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 47 FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[46] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] += amp_sv[0]; jamp_sv[86] -= amp_sv[0]; @@ -1822,8 +1994,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 49 FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[48] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1836,8 +2011,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 50 FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[49] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1850,8 +2028,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 51 FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[50] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += amp_sv[0]; jamp_sv[79] -= amp_sv[0]; @@ -1866,8 +2047,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 52 FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[51] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } 
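// --------------------------------------------------------------------------
// A minimal sketch (illustration only, not literal patch content) of the
// transformation that every multichannel hunk in this generated CPPProcess.cc
// applies, including the one just above. "idiag" is a hypothetical 0-based
// index standing in for the hardcoded N-1 of diagram number N in each hunk.
//
//   // Before: only the single diagram selected by channelId contributed
//   // to a scalar per-event numerator.
//   if( channelId == idiag + 1 ) numerators_sv += cxabs2( amp_sv[0] );
//   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
//
//   // After: slot idiag of a per-diagram array is filled for every diagram
//   // whenever multichannel is enabled; the denominator remains one running
//   // sum over all diagrams.
//   if( channelId != 0 )
//   {
//     numerators_sv[idiag] += cxabs2( amp_sv[0] );
//     denominators_sv += cxabs2( amp_sv[0] );
//   }
//
// This relies on numerators_sv now being declared as an fptype_sv* obtained
// via the new kernelAccessP accessor, instead of a single fptype_sv&
// obtained via kernelAccess (see the earlier hunks in this patch).
// --------------------------------------------------------------------------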
#endif jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1880,8 +2064,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 53 FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[52] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1894,8 +2081,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 54 FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[53] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += amp_sv[0]; jamp_sv[78] -= amp_sv[0]; @@ -1910,8 +2100,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 55 FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[54] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += amp_sv[0]; jamp_sv[73] -= amp_sv[0]; @@ -1926,8 +2119,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 56 FFV1_0( w_fp[34], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[55] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[88] += amp_sv[0]; jamp_sv[89] -= amp_sv[0]; @@ -1942,8 +2138,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 57 FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[56] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; @@ -1963,8 +2162,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 58 FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[57] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1977,8 +2179,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 59 FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[58] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[118] += cxtype( 0, 1 ) * 
amp_sv[0]; jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1991,8 +2196,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 60 VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[59] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2011,8 +2219,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 61 FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[60] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[114] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; @@ -2027,8 +2238,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 62 VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[61] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2047,8 +2261,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 63 FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[62] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[108] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; @@ -2097,8 +2314,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 65 FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[64] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2111,8 +2331,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 66 FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[65] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2125,8 +2348,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 67 FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[66] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] += amp_sv[0]; 
jamp_sv[103] -= amp_sv[0]; @@ -2141,8 +2367,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 68 FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[67] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2155,8 +2384,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 69 FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[68] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2169,8 +2401,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 70 FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[69] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += amp_sv[0]; jamp_sv[102] -= amp_sv[0]; @@ -2185,8 +2420,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 71 FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[70] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += amp_sv[0]; jamp_sv[97] -= amp_sv[0]; @@ -2201,8 +2439,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 72 FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[71] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[112] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; @@ -2217,8 +2458,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 73 FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[72] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2238,8 +2482,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 74 FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 74 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[73] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2252,8 
+2499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 75 FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[74] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2266,8 +2516,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 76 VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[75] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2286,8 +2539,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 77 FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[76] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[27] -= amp_sv[0]; @@ -2302,8 +2558,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 78 VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[77] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2322,8 +2581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 79 FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[78] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += amp_sv[0]; jamp_sv[29] -= amp_sv[0]; @@ -2372,8 +2634,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 81 FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[80] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -2388,8 +2653,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 82 FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[81] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; @@ -2404,8 +2672,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 83 FFV1_0( 
w_fp[46], w_fp[2], w_fp[25], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[82] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2424,8 +2695,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 84 FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[83] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2438,8 +2712,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 85 FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[84] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2452,8 +2729,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 86 VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[85] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2472,8 +2752,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 87 FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[86] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -2488,8 +2771,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 88 VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[87] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2508,8 +2794,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 89 FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[88] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[28] -= amp_sv[0]; @@ -2558,8 +2847,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 91 FFV1_0( w_fp[38], w_fp[52], 
w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[90] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -2574,8 +2866,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 92 FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[91] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += amp_sv[0]; jamp_sv[68] -= amp_sv[0]; @@ -2590,8 +2885,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 93 FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 93 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[92] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2610,8 +2908,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 94 FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[93] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2624,8 +2925,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 95 FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[94] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2638,8 +2942,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 96 VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[95] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2658,8 +2965,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 97 FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[96] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[24] -= amp_sv[0]; @@ -2674,8 +2984,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 98 VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[97] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2694,8 +3007,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 99 FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[98] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; @@ -2744,8 +3060,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 101 FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[100] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -2760,8 +3079,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 102 FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[101] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += amp_sv[0]; jamp_sv[62] -= amp_sv[0]; @@ -2776,8 +3098,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 103 FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[102] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2796,8 +3121,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 104 FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[103] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -2812,8 +3140,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 105 FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[104] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2832,8 +3163,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 106 FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 106 ) numerators_sv += 
cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[105] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[64] += amp_sv[0]; jamp_sv[65] -= amp_sv[0]; @@ -2848,8 +3182,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 107 FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 107 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[106] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2868,8 +3205,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 108 FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[107] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2888,8 +3228,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 109 FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[108] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2908,8 +3251,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 110 FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[109] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; @@ -2924,8 +3270,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 111 FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[110] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2944,8 +3293,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 112 FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[111] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[70] += amp_sv[0]; jamp_sv[71] -= amp_sv[0]; @@ -2960,8 +3312,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 113 FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] ); - if( 
channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[112] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2980,8 +3335,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 114 FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 114 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[113] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3000,8 +3358,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 115 FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 115 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[114] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3020,8 +3381,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 116 FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 116 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[115] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -3036,8 +3400,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 117 FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 117 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[116] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3056,8 +3423,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 118 FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 118 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[117] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[94] += amp_sv[0]; jamp_sv[95] -= amp_sv[0]; @@ -3072,8 +3442,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 119 FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 119 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[118] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3092,8 +3465,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 120 FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 120 ) numerators_sv += cxabs2( amp_sv[0] ); - 
if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[119] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3112,8 +3488,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 121 FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 121 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[120] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3203,8 +3582,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 124 FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 124 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[123] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] -= amp_sv[0]; @@ -3216,8 +3598,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 125 FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 125 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[124] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= amp_sv[0]; @@ -3230,8 +3615,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 126 FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 126 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[125] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] -= amp_sv[0]; @@ -3243,9 +3631,12 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 127 FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 127 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif + if( channelId != 0 ) + { + numerators_sv[126] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } +#endif jamp_sv[15] -= amp_sv[0]; // *** DIAGRAM 128 OF 1240 *** @@ -3256,8 +3647,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 128 FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 128 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[127] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[23] -= amp_sv[0]; @@ -3269,8 +3663,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 129 FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 129 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[128] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif 
jamp_sv[21] -= amp_sv[0]; @@ -3282,8 +3679,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 130 VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 130 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[129] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -3298,8 +3698,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 131 FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 131 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[130] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3312,8 +3715,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 132 FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 132 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[131] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3326,8 +3732,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 133 VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 133 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[132] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -3342,8 +3751,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 134 FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 134 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[133] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3356,8 +3768,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 135 FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 135 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[134] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3370,8 +3785,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 136 VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 136 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[135] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[11] -= 
amp_sv[0]; @@ -3386,8 +3804,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 137 FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 137 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[136] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3400,8 +3821,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 138 FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 138 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[137] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3438,8 +3862,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 140 VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 140 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[139] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3458,8 +3885,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 141 VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 141 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[140] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3512,8 +3942,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 143 FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 143 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[142] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3526,8 +3959,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 144 FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 144 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[143] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; @@ -3542,8 +3978,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 145 FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 145 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[144] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] += cxtype( 0, 1 ) * 
amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3556,8 +3995,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 146 FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 146 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[145] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -3572,8 +4014,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 147 FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 147 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[146] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3586,8 +4031,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 148 VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 148 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[147] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -3602,8 +4050,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 149 FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 149 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[148] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3616,8 +4067,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 150 FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 150 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[149] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3630,8 +4084,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 151 VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 151 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[150] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -3646,8 +4103,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 152 FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 152 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[151] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] -= 
cxtype( 0, 1 ) * amp_sv[0]; @@ -3660,8 +4120,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 153 FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 153 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[152] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -3676,8 +4139,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 154 VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 154 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[153] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3696,8 +4162,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 155 FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 155 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[154] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[17] -= amp_sv[0]; @@ -3713,8 +4182,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 156 VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 156 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[155] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3733,8 +4205,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 157 VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 157 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[156] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3787,8 +4262,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 159 FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 159 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[158] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3801,8 +4279,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 160 FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 160 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[159] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ 
-3817,8 +4298,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 161 FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 161 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[160] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3831,8 +4315,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 162 FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 162 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[161] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -3847,8 +4334,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 163 FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 163 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[162] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3861,8 +4351,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 164 VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 164 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[163] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; @@ -3877,8 +4370,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 165 FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 165 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[164] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3891,8 +4387,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 166 FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 166 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[165] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3905,8 +4404,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 167 VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 167 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[166] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -3921,8 +4423,11 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 168 FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 168 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[167] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3935,8 +4440,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 169 FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 169 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[168] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; @@ -3951,8 +4459,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 170 VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 170 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[169] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3971,8 +4482,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 171 FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 171 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[170] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; @@ -3988,8 +4502,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 172 VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 172 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[171] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4008,8 +4525,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 173 VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 173 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[172] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4062,8 +4582,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 175 FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 175 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[174] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4076,8 +4599,11 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 176 FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 176 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[175] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -4092,8 +4618,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 177 FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 177 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[176] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4106,8 +4635,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 178 FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 178 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[177] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[13] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; @@ -4122,8 +4654,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 179 FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 179 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[178] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4136,8 +4671,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 180 VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 180 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[179] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; @@ -4152,8 +4690,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 181 FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 181 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[180] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4166,8 +4707,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 182 FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 182 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[181] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4180,8 +4724,11 @@ namespace mg5amcCpu // 
Amplitude(s) for diagram number 183 VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 183 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[182] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -4196,8 +4743,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 184 FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 184 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[183] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4210,8 +4760,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 185 FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 185 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[184] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -4226,8 +4779,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 186 VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 186 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[185] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4246,8 +4802,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 187 FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 187 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[186] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += amp_sv[0]; jamp_sv[9] -= amp_sv[0]; @@ -4262,8 +4821,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 188 FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 188 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[187] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; @@ -4275,8 +4837,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 189 FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 189 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[188] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; @@ -4288,8 +4853,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 190 FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 
190 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[189] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] -= amp_sv[0]; @@ -4301,8 +4869,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 191 FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 191 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[190] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[13] -= amp_sv[0]; @@ -4314,8 +4885,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 192 FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 192 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[191] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] -= amp_sv[0]; @@ -4327,8 +4901,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 193 FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 193 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[192] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] -= amp_sv[0]; @@ -4340,8 +4917,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 194 FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 194 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[193] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4354,8 +4934,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 195 VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 195 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[194] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -4370,8 +4953,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 196 FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 196 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[195] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4384,8 +4970,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 197 FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 197 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[196] += cxabs2( amp_sv[0] ); + denominators_sv += 
cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -4397,8 +4986,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 198 FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 198 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[197] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -4410,8 +5002,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 199 FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 199 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[198] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= amp_sv[0]; @@ -4423,8 +5018,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 200 FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 200 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[199] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= amp_sv[0]; @@ -4436,8 +5034,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 201 FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 201 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[200] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[20] -= amp_sv[0]; @@ -4449,8 +5050,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 202 FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 202 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[201] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] -= amp_sv[0]; @@ -4462,8 +5066,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 203 FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 203 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[202] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4476,8 +5083,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 204 VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 204 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[203] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -4492,8 +5102,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 205 FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 205 
) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[204] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4506,8 +5119,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 206 FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 206 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[205] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -4519,8 +5135,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 207 FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 207 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[206] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -4532,8 +5151,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 208 FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 208 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[207] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= amp_sv[0]; @@ -4545,8 +5167,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 209 FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 209 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[208] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= amp_sv[0]; @@ -4558,8 +5183,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 210 FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 210 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[209] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[14] -= amp_sv[0]; @@ -4571,8 +5199,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 211 FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 211 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[210] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] -= amp_sv[0]; @@ -4584,8 +5215,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 212 FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 212 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[211] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 
cxtype( 0, 1 ) * amp_sv[0]; @@ -4598,8 +5232,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 213 VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 213 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[212] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -4614,8 +5251,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 214 FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 214 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[213] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4628,8 +5268,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 215 FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 215 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[214] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4642,8 +5285,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 216 FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 216 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[215] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -4658,8 +5304,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 217 VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 217 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[216] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4678,8 +5327,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 218 VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 218 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[217] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4732,8 +5384,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 220 FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 220 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[219] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; @@ 
-4748,8 +5403,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 221 FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 221 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[220] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4762,8 +5420,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 222 FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 222 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[221] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4776,8 +5437,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 223 FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 223 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[222] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -4792,8 +5456,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 224 VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 224 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[223] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4812,8 +5479,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 225 VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 225 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[224] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4866,8 +5536,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 227 FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 227 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[226] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; @@ -4882,8 +5555,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 228 FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 228 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[227] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; 
@@ -4896,8 +5572,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 229 FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 229 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[228] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4910,8 +5589,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 230 FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 230 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[229] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -4926,8 +5608,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 231 VVV1_0( w_fp[62], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 231 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[230] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4946,8 +5631,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 232 VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 232 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[231] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; @@ -5000,8 +5688,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 234 FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 234 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[233] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -5016,8 +5707,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 235 FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 235 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[234] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5329,8 +6023,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 247 FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 247 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[246] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[41] -= amp_sv[0]; @@ -5342,8 +6039,11 @@ namespace mg5amcCpu // Amplitude(s) for 
diagram number 248 FFV1_0( w_fp[34], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 248 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[247] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[47] -= amp_sv[0]; @@ -5356,8 +6056,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 249 FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 249 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[248] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[35] -= amp_sv[0]; @@ -5369,8 +6072,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 250 FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 250 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[249] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[45] -= amp_sv[0]; @@ -5382,8 +6088,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 251 FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 251 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[250] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] -= amp_sv[0]; @@ -5395,8 +6104,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 252 FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 252 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[251] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[39] -= amp_sv[0]; @@ -5408,8 +6120,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 253 VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 253 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[252] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] += amp_sv[0]; jamp_sv[39] -= amp_sv[0]; @@ -5424,8 +6139,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 254 FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 254 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[253] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5438,8 +6156,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 255 FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 255 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( 
channelId != 0 ) + { + numerators_sv[254] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5452,8 +6173,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 256 VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 256 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[255] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[35] += amp_sv[0]; jamp_sv[39] -= amp_sv[0]; @@ -5468,8 +6192,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 257 FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 257 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[256] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5482,8 +6209,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 258 FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 258 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[257] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5496,8 +6226,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 259 VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 259 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[258] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] += amp_sv[0]; jamp_sv[35] -= amp_sv[0]; @@ -5512,8 +6245,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 260 FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 260 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[259] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5526,8 +6262,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 261 FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 261 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[260] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5562,8 +6301,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 263 VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 263 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[262] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; @@ -5582,8 +6324,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 264 VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 264 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[263] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; @@ -5636,8 +6381,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 266 FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 266 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[265] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5650,8 +6398,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 267 FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 267 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[266] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[35] += amp_sv[0]; jamp_sv[59] -= amp_sv[0]; @@ -5666,8 +6417,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 268 FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 268 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[267] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5680,8 +6434,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 269 FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 269 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[268] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] += amp_sv[0]; jamp_sv[57] -= amp_sv[0]; @@ -5696,8 +6453,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 270 FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 270 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[269] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5710,8 +6470,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 271 VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 271 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv 
+= cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[270] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[81] += amp_sv[0]; jamp_sv[87] -= amp_sv[0]; @@ -5726,8 +6489,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 272 FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 272 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[271] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5740,8 +6506,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 273 FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 273 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[272] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5754,8 +6523,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 274 VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 274 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[273] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[105] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; @@ -5770,8 +6542,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 275 FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 275 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[274] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5784,8 +6559,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 276 FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 276 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[275] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[93] += amp_sv[0]; jamp_sv[95] -= amp_sv[0]; @@ -5800,8 +6578,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 277 VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 277 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[276] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; @@ -5820,8 +6601,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 278 FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 278 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[277] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] += amp_sv[0]; jamp_sv[35] -= amp_sv[0]; @@ -5836,8 +6620,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 279 VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 279 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[278] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; @@ -5856,8 +6643,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 280 VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 280 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[279] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; @@ -5910,8 +6700,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 282 FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 282 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[281] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5924,8 +6717,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 283 FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 283 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[282] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[41] += amp_sv[0]; jamp_sv[83] -= amp_sv[0]; @@ -5940,8 +6736,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 284 FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 284 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[283] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5954,8 +6753,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 285 FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 285 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[284] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[39] += amp_sv[0]; jamp_sv[57] -= amp_sv[0]; @@ -5970,8 +6772,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 286 FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 286 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( 
channelId != 0 ) + { + numerators_sv[285] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5984,8 +6789,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 287 VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 287 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[286] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[57] += amp_sv[0]; jamp_sv[63] -= amp_sv[0]; @@ -6000,8 +6808,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 288 FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 288 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[287] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6014,8 +6825,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 289 FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 289 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[288] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6028,8 +6842,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 290 VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 290 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[289] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[107] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; @@ -6044,8 +6861,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 291 FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 291 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[290] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6058,8 +6878,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 292 FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 292 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[291] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[69] += amp_sv[0]; jamp_sv[71] -= amp_sv[0]; @@ -6074,8 +6897,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 293 VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 293 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 
) + { + numerators_sv[292] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; @@ -6094,8 +6920,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 294 FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 294 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[293] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[39] += amp_sv[0]; jamp_sv[41] -= amp_sv[0]; @@ -6110,8 +6939,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 295 VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 295 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[294] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; @@ -6130,8 +6962,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 296 VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 296 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[295] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; @@ -6184,8 +7019,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 298 FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 298 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[297] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6198,8 +7036,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 299 FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 299 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[298] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[47] += amp_sv[0]; jamp_sv[83] -= amp_sv[0]; @@ -6214,8 +7055,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 300 FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 300 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[299] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6228,8 +7072,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 301 FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 301 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( 
channelId != 0 ) + { + numerators_sv[300] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[45] += amp_sv[0]; jamp_sv[59] -= amp_sv[0]; @@ -6244,8 +7091,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 302 FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 302 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[301] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6258,8 +7108,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 303 VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 303 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[302] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[59] += amp_sv[0]; jamp_sv[63] -= amp_sv[0]; @@ -6274,8 +7127,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 304 FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 304 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[303] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6288,8 +7144,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 305 FFV1_0( w_fp[99], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 305 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[304] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6302,8 +7161,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 306 VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 306 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[305] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[83] += amp_sv[0]; jamp_sv[87] -= amp_sv[0]; @@ -6318,8 +7180,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 307 FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 307 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[306] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6332,8 +7197,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 308 FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 308 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { 
+ numerators_sv[307] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[63] += amp_sv[0]; jamp_sv[65] -= amp_sv[0]; @@ -6348,8 +7216,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 309 VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 309 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[308] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; @@ -6368,8 +7239,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 310 FFV1_0( w_fp[90], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 310 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[309] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[45] += amp_sv[0]; jamp_sv[47] -= amp_sv[0]; @@ -6384,8 +7258,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 311 FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 311 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[310] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[65] -= amp_sv[0]; @@ -6397,8 +7274,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 312 FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 312 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[311] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[71] -= amp_sv[0]; @@ -6410,8 +7290,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 313 FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 313 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[312] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[59] -= amp_sv[0]; @@ -6423,8 +7306,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 314 FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 314 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[313] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[69] -= amp_sv[0]; @@ -6436,8 +7322,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 315 FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 315 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[314] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[57] -= amp_sv[0]; @@ -6449,8 +7338,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 316 FFV1_0( w_fp[88], 
w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 316 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[315] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[63] -= amp_sv[0]; @@ -6462,8 +7354,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 317 FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 317 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[316] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6476,8 +7371,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 318 VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 318 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[317] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[57] += amp_sv[0]; jamp_sv[59] -= amp_sv[0]; @@ -6492,8 +7390,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 319 FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 319 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[318] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6506,8 +7407,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 320 FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 320 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[319] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[89] -= amp_sv[0]; @@ -6519,8 +7423,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 321 FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 321 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[320] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[95] -= amp_sv[0]; @@ -6532,8 +7439,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 322 FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 322 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[321] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[83] -= amp_sv[0]; @@ -6545,8 +7455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 323 FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 323 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[322] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[93] -= amp_sv[0]; @@ -6558,8 +7471,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 324 FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 324 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[323] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[81] -= amp_sv[0]; @@ -6571,8 +7487,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 325 FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 325 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[324] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[87] -= amp_sv[0]; @@ -6584,8 +7503,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 326 FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 326 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[325] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6598,8 +7520,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 327 VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 327 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[326] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[81] += amp_sv[0]; jamp_sv[83] -= amp_sv[0]; @@ -6614,8 +7539,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 328 FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 328 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[327] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6628,8 +7556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 329 FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 329 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[328] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[113] -= amp_sv[0]; @@ -6641,8 +7572,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 330 FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 330 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[329] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[119] -= amp_sv[0]; @@ -6654,8 +7588,11 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 331 FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 331 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[330] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[107] -= amp_sv[0]; @@ -6667,8 +7604,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 332 FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 332 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[331] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[117] -= amp_sv[0]; @@ -6680,8 +7620,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 333 FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 333 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[332] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[105] -= amp_sv[0]; @@ -6693,8 +7636,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 334 FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 334 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[333] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[111] -= amp_sv[0]; @@ -6706,8 +7652,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 335 FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 335 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[334] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6720,8 +7669,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 336 VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 336 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[335] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[105] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; @@ -6736,8 +7688,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 337 FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 337 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[336] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6750,8 +7705,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 338 FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 338 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[337] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6764,8 +7722,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 339 FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 339 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[338] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[65] += amp_sv[0]; jamp_sv[89] -= amp_sv[0]; @@ -6780,8 +7741,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 340 VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 340 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[339] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; @@ -6800,8 +7764,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 341 VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 341 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[340] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; @@ -6854,8 +7821,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 343 FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 343 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[342] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] += amp_sv[0]; jamp_sv[39] -= amp_sv[0]; @@ -6870,8 +7840,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 344 FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 344 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[343] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6884,8 +7857,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 345 FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 345 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[344] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -6898,8 +7874,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 346 FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( 
channelId == 346 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[345] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[71] += amp_sv[0]; jamp_sv[89] -= amp_sv[0]; @@ -6914,8 +7893,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 347 VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 347 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[346] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; @@ -6934,8 +7916,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 348 VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 348 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[347] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; @@ -6988,8 +7973,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 350 FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 350 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[349] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[35] += amp_sv[0]; jamp_sv[45] -= amp_sv[0]; @@ -7004,8 +7992,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 351 FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 351 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[350] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7018,8 +8009,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 352 FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 352 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[351] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7032,8 +8026,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 353 FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 353 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[352] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[65] += amp_sv[0]; jamp_sv[71] -= amp_sv[0]; @@ -7048,8 +8045,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 354 VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 354 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[353] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; @@ -7068,8 +8068,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 355 VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 355 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[354] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; @@ -7122,8 +8125,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 357 FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 357 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[356] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[41] += amp_sv[0]; jamp_sv[47] -= amp_sv[0]; @@ -7138,8 +8144,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 358 FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 358 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[357] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7443,8 +8452,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 370 FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 370 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[369] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7457,8 +8469,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 371 FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 371 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[370] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7472,8 +8487,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 372 VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 372 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[371] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7492,8 +8510,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 373 FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], 1.0, &amp_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 373 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[372] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] += amp_sv[0]; jamp_sv[44] -= amp_sv[0]; @@ -7508,8 +8529,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 374 VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 374 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[373] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7528,8 +8552,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 375 FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 375 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[374] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] += amp_sv[0]; jamp_sv[38] -= amp_sv[0]; @@ -7580,8 +8607,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 377 FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 377 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[376] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7594,8 +8624,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 378 FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 378 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[377] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7608,8 +8641,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 379 FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 379 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[378] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -7624,8 +8660,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 380 FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 380 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[379] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7638,8 +8677,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 381 FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 381 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[380] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7652,8 +8694,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 382 FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 382 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[381] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -7668,8 +8713,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 383 FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 383 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[382] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -7684,8 +8732,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 384 FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 384 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[383] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] += amp_sv[0]; jamp_sv[41] -= amp_sv[0]; @@ -7700,8 +8751,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 385 FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 385 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[384] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; @@ -7720,8 +8774,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 386 FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 386 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[385] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7734,8 +8791,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 387 FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 387 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[386] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7748,8 +8808,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 388 VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 388 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[387] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7768,8 +8831,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 389 FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 389 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[388] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[51] -= amp_sv[0]; @@ -7784,8 +8850,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 390 VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 390 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[389] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7804,8 +8873,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 391 FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 391 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[390] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += amp_sv[0]; jamp_sv[53] -= amp_sv[0]; @@ -7854,8 +8926,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 393 FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 393 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[392] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7868,8 +8943,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 394 FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 394 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[393] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7882,8 +8960,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 395 FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 395 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[394] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[75] += amp_sv[0]; jamp_sv[85] -= amp_sv[0]; @@ -7898,8 +8979,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 396 FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 396 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[395] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7912,8 +8996,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 397 FFV1_0( w_fp[52], w_fp[106], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 397 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[396] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7926,8 +9013,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 398 FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 398 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[397] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[99] += amp_sv[0]; jamp_sv[109] -= amp_sv[0]; @@ -7942,8 +9032,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 399 FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 399 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[398] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[91] += amp_sv[0]; jamp_sv[94] -= amp_sv[0]; @@ -7958,8 +9051,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 400 FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 400 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[399] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; @@ -7974,8 +9070,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 401 FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 401 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[400] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; @@ -7994,8 +9093,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 402 FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 402 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[401] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; @@ -8010,8 +9112,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 403 FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 
-    if( channelId == 403 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[402] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
@@ -8030,8 +9135,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 404
     FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 404 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[403] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[40] += amp_sv[0];
     jamp_sv[41] -= amp_sv[0];
@@ -8046,8 +9154,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 405
     FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 405 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[404] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
@@ -8066,8 +9177,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 406
     FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 406 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[405] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
@@ -8086,8 +9200,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 407
     FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 407 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[406] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
@@ -8164,8 +9281,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 409
     VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 409 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[408] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
@@ -8192,8 +9312,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 410
     VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 410 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[409] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
@@ -8220,8 +9343,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 411
     VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 411 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[410] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[10] -= amp_sv[0];
     jamp_sv[11] += amp_sv[0];
@@ -8248,8 +9374,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 412
     FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 412 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[411] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
@@ -8268,8 +9397,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 413
     FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 413 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[412] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[98] += amp_sv[0];
     jamp_sv[99] -= amp_sv[0];
@@ -8284,8 +9416,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 414
     FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 414 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[413] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[106] += amp_sv[0];
     jamp_sv[107] -= amp_sv[0];
@@ -8300,8 +9435,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 415
     FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 415 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[414] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
@@ -8320,8 +9458,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 416
     FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 416 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[415] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
@@ -8336,8 +9477,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 417
     FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 417 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[416] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[36] += amp_sv[0];
     jamp_sv[38] -= amp_sv[0];
@@ -8352,8 +9496,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 418
     FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 418 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[417] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] += amp_sv[0];
     jamp_sv[9] -= amp_sv[0];
@@ -8368,8 +9515,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 419
     FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 419 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[418] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
@@ -8388,8 +9538,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 420
     FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 420 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[419] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[46] += amp_sv[0];
     jamp_sv[47] -= amp_sv[0];
@@ -8404,8 +9557,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 421
     FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 421 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[420] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
@@ -8424,8 +9580,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 422
     FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 422 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[421] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
@@ -8444,8 +9603,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 423
     FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 423 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[422] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
@@ -8522,8 +9684,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 425
     VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 425 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[424] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] += amp_sv[0];
     jamp_sv[10] -= amp_sv[0];
@@ -8550,8 +9715,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 426
     VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 426 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[425] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
@@ -8578,8 +9746,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 427
     VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 427 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[426] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] -= amp_sv[0];
     jamp_sv[9] += amp_sv[0];
@@ -8606,8 +9777,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 428
     FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 428 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[427] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
@@ -8626,8 +9800,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 429
     FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 429 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[428] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[74] += amp_sv[0];
     jamp_sv[75] -= amp_sv[0];
@@ -8642,8 +9819,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 430
     FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 430 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[429] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[82] += amp_sv[0];
     jamp_sv[83] -= amp_sv[0];
@@ -8658,8 +9838,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 431
     FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 431 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[430] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
@@ -8678,8 +9861,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 432
     FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 432 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[431] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] += amp_sv[0];
     jamp_sv[10] -= amp_sv[0];
@@ -8694,8 +9880,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 433
     FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 433 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[432] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[42] += amp_sv[0];
     jamp_sv[44] -= amp_sv[0];
@@ -8710,8 +9899,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 434
     VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 434 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[433] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] -= amp_sv[0];
     jamp_sv[25] += amp_sv[0];
@@ -8738,8 +9930,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 435
     VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 435 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[434] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] -= amp_sv[0];
     jamp_sv[24] += amp_sv[0];
@@ -8824,8 +10019,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 437
     VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 437 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[436] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] += amp_sv[0];
     jamp_sv[24] -= amp_sv[0];
@@ -8852,8 +10050,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 438
     VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 438 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[437] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] += amp_sv[0];
     jamp_sv[24] -= amp_sv[0];
@@ -8938,8 +10139,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 440
     VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 440 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[439] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[11] += amp_sv[0];
     jamp_sv[25] -= amp_sv[0];
@@ -8966,8 +10170,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 441
     VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 441 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[440] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[10] += amp_sv[0];
     jamp_sv[25] -= amp_sv[0];
@@ -9288,8 +10495,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 447
     VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 447 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[446] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += amp_sv[0];
     jamp_sv[7] -= amp_sv[0];
@@ -9316,8 +10526,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 448
     VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 448 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[447] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += amp_sv[0];
     jamp_sv[7] -= amp_sv[0];
@@ -9344,8 +10557,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 449
     VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 449 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[448] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] -= amp_sv[0];
     jamp_sv[11] += amp_sv[0];
@@ -9372,8 +10588,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 450
     VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 450 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[449] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9392,8 +10611,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 451
     FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 451 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[450] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[91] += amp_sv[0];
     jamp_sv[92] -= amp_sv[0];
@@ -9408,8 +10630,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 452
     FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 452 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[451] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9422,8 +10647,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 453
     FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 453 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[452] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9436,8 +10664,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 454
     FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 454 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[453] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[78] += amp_sv[0];
     jamp_sv[80] -= amp_sv[0];
@@ -9452,8 +10683,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 455
     VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 455 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[454] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9506,8 +10740,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 457
     FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 457 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[456] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[74] += amp_sv[0];
     jamp_sv[78] -= amp_sv[0];
@@ -9522,8 +10759,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 458
     FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 458 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[457] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9536,8 +10776,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 459
     FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 459 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[458] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9550,8 +10793,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 460
     VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 460 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[459] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9570,8 +10816,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 461
     FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 461 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[460] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[115] += amp_sv[0];
     jamp_sv[116] -= amp_sv[0];
@@ -9586,8 +10835,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 462
     FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 462 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[461] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9600,8 +10852,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 463
     FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 463 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[462] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9614,8 +10869,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 464
     FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 464 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[463] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[102] += amp_sv[0];
     jamp_sv[104] -= amp_sv[0];
@@ -9630,8 +10888,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 465
     VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 465 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[464] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9684,8 +10945,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 467
     FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 467 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[466] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[98] += amp_sv[0];
     jamp_sv[102] -= amp_sv[0];
@@ -9700,8 +10964,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 468
     FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 468 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[467] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9714,8 +10981,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 469
     FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 469 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[468] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9728,8 +10998,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 470
     VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 470 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[469] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9748,8 +11021,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 471
     FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 471 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[470] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] += amp_sv[0];
     jamp_sv[25] -= amp_sv[0];
@@ -9764,8 +11040,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 472
     FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 472 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[471] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9778,8 +11057,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 473
     FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 473 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[472] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9792,8 +11074,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 474
     FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 474 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[473] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[10] += amp_sv[0];
     jamp_sv[52] -= amp_sv[0];
@@ -9808,8 +11093,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 475
     VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 475 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[474] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9862,8 +11150,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 477
     VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 477 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[476] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9882,8 +11173,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 478
     FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 478 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[477] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += amp_sv[0];
     jamp_sv[24] -= amp_sv[0];
@@ -9898,8 +11192,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 479
     FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 479 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[478] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9912,8 +11209,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 480
     FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 480 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[479] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -9926,8 +11226,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 481
     FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 481 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[480] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] += amp_sv[0];
     jamp_sv[50] -= amp_sv[0];
@@ -9942,8 +11245,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 482
     VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 482 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[481] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9996,8 +11302,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 484
     FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 484 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[483] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10016,8 +11325,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 485
     FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 485 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[484] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10036,8 +11348,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 486
     FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 486 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[485] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10056,8 +11371,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 487
     FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 487 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[486] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += amp_sv[0];
     jamp_sv[7] -= amp_sv[0];
@@ -10072,8 +11390,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 488
     FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 488 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[487] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10092,8 +11413,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 489
     FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 489 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[488] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[92] += amp_sv[0];
     jamp_sv[93] -= amp_sv[0];
@@ -10235,8 +11559,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 493
     FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 493 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[492] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10249,8 +11576,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 494
     FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 494 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[493] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10263,8 +11593,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 495
     VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 495 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[494] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10283,8 +11616,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 496
     FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 496 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[495] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[43] += amp_sv[0];
     jamp_sv[44] -= amp_sv[0];
@@ -10299,8 +11635,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 497
     VVV1_0( w_fp[104], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 497 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[496] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10319,8 +11658,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 498
     FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 498 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[497] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[30] += amp_sv[0];
     jamp_sv[32] -= amp_sv[0];
@@ -10371,8 +11713,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 500
     FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 500 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[499] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10385,8 +11730,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 501
     FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 501 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[500] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10399,8 +11747,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 502
     FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 502 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[501] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[27] += amp_sv[0];
     jamp_sv[37] -= amp_sv[0];
@@ -10415,8 +11766,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 503
     FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 503 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[502] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10429,8 +11783,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 504
     FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 504 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[503] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10443,8 +11800,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 505
     FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 505 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[504] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[26] += amp_sv[0];
     jamp_sv[30] -= amp_sv[0];
@@ -10459,8 +11819,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 506
     FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 506 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[505] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[26] += amp_sv[0];
     jamp_sv[27] -= amp_sv[0];
@@ -10475,8 +11838,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 507
     FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 507 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[506] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[34] += amp_sv[0];
     jamp_sv[35] -= amp_sv[0];
@@ -10491,8 +11857,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 508
     FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 508 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[507] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10511,8 +11880,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 509
     FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 509 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[508] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10525,8 +11897,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 510
     FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 510 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[509] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10539,8 +11914,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 511
     VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 511 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[510] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10559,8 +11937,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 512
     FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 512 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[511] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[15] += amp_sv[0];
     jamp_sv[51] -= amp_sv[0];
@@ -10575,8 +11956,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 513
     VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 513 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[512] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10595,8 +11979,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 514
     FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 514 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[513] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[17] += amp_sv[0];
     jamp_sv[77] -= amp_sv[0];
@@ -10645,8 +12032,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 516
     FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 516 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[515] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10659,8 +12049,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 517
     FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 517 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[516] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10673,8 +12066,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 518
     FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 518 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[517] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[51] += amp_sv[0];
     jamp_sv[61] -= amp_sv[0];
@@ -10689,8 +12085,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 519
     FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 519 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[518] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10703,8 +12102,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 520
     FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 520 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[519] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10717,8 +12119,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 521
     FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 521 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[520] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[101] += amp_sv[0];
     jamp_sv[109] -= amp_sv[0];
@@ -10733,8 +12138,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 522
     FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 522 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[521] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[67] += amp_sv[0];
     jamp_sv[70] -= amp_sv[0];
@@ -10749,8 +12157,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 523
     FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 523 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[522] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[15] += amp_sv[0];
     jamp_sv[17] -= amp_sv[0];
@@ -10765,8 +12176,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 524
     FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 524 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[523] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10785,8 +12199,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 525
     FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 525 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[524] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[16] += amp_sv[0];
     jamp_sv[17] -= amp_sv[0];
@@ -10801,8 +12218,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 526
     FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 526 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[525] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10821,8 +12241,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 527
     FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 527 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[526] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[34] += amp_sv[0];
     jamp_sv[35] -= amp_sv[0];
@@ -10837,8 +12260,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 528
     FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 528 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[527] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10857,8 +12283,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 529
     FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 529 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[528] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10877,8 +12306,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 530
     FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 530 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[529] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10955,8 +12387,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 532
     VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 532 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[531] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[12] += amp_sv[0];
     jamp_sv[14] -= amp_sv[0];
@@ -10983,8 +12418,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 533
     VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 533 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[532] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[12] += amp_sv[0];
     jamp_sv[14] -= amp_sv[0];
@@ -11011,8 +12449,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 534
     VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 534 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[533] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[16] -= amp_sv[0];
     jamp_sv[17] += amp_sv[0];
@@ -11039,8 +12480,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 535
     FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 535 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[534] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11059,8 +12503,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 536
     FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 536 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[535] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[100] += amp_sv[0];
     jamp_sv[101] -= amp_sv[0];
@@ -11075,8 +12522,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 537
     FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 537 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[536] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[104] += amp_sv[0];
     jamp_sv[105] -= amp_sv[0];
@@ -11091,8 +12541,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 538
     FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 538 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[537] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11111,8 +12564,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 539
     FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 539 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[538] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[12] += amp_sv[0];
     jamp_sv[14] -= amp_sv[0];
@@ -11127,8 +12583,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 540
     FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 540 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[539] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[30] += amp_sv[0];
     jamp_sv[32] -= amp_sv[0];
@@ -11143,8 +12602,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 541
     FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 541 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[540] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[14] += amp_sv[0];
     jamp_sv[15] -= amp_sv[0];
@@ -11159,8 +12621,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 542
&_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 542 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[541] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; @@ -11179,8 +12644,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 543 FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 543 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[542] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[44] += amp_sv[0]; jamp_sv[45] -= amp_sv[0]; @@ -11195,8 +12663,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 544 FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 544 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[543] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; @@ -11215,8 +12686,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 545 FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 545 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[544] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; @@ -11235,8 +12709,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 546 FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 546 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[545] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; @@ -11313,8 +12790,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 548 VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 548 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[547] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[13] += amp_sv[0]; jamp_sv[16] -= amp_sv[0]; @@ -11341,8 +12821,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 549 VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 549 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[548] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[13] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; @@ -11369,8 +12852,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 550 VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], 1.0, 
&_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 550 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[549] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[14] -= amp_sv[0]; jamp_sv[15] += amp_sv[0]; @@ -11397,8 +12883,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 551 FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 551 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[550] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; @@ -11417,8 +12906,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 552 FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 552 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[551] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[50] += amp_sv[0]; jamp_sv[51] -= amp_sv[0]; @@ -11433,8 +12925,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 553 FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 553 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[552] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[58] += amp_sv[0]; jamp_sv[59] -= amp_sv[0]; @@ -11449,8 +12944,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 554 FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 554 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[553] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; @@ -11469,8 +12967,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 555 FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 555 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[554] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[13] += amp_sv[0]; jamp_sv[16] -= amp_sv[0]; @@ -11485,8 +12986,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 556 FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 556 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[555] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[43] += amp_sv[0]; jamp_sv[46] -= amp_sv[0]; @@ -11501,8 +13005,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 557 VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 557 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[556] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[13] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; @@ -11529,8 +13036,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 558 VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 558 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[557] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; @@ -11615,8 +13125,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 560 VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 560 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[559] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; @@ -11643,8 +13156,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 561 VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 561 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[560] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[14] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; @@ -11729,8 +13245,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 563 VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 563 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[562] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += amp_sv[0]; jamp_sv[27] -= amp_sv[0]; @@ -11757,8 +13276,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 564 VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 564 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[563] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[27] -= amp_sv[0]; @@ -12079,8 +13601,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 570 VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 570 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[569] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; @@ -12107,8 +13632,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 571 VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 571 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + 
numerators_sv[570] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; @@ -12135,8 +13663,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 572 VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 572 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[571] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] -= amp_sv[0]; jamp_sv[17] += amp_sv[0]; @@ -12163,8 +13694,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 573 VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 573 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[572] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12183,8 +13717,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 574 FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 574 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[573] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[67] += amp_sv[0]; jamp_sv[68] -= amp_sv[0]; @@ -12199,8 +13736,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 575 FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 575 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[574] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12213,8 +13753,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 576 FFV1_0( w_fp[99], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 576 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[575] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12227,8 +13770,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 577 FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 577 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[576] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[54] += amp_sv[0]; jamp_sv[56] -= amp_sv[0]; @@ -12243,8 +13789,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 578 VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 578 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[577] += cxabs2( amp_sv[0] 
); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12297,8 +13846,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 580 FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 580 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[579] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[50] += amp_sv[0]; jamp_sv[54] -= amp_sv[0]; @@ -12313,8 +13865,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 581 FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 581 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[580] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12327,8 +13882,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 582 FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 582 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[581] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12341,8 +13899,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 583 VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 583 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[582] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12361,8 +13922,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 584 FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 584 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[583] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[109] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; @@ -12377,8 +13941,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 585 FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 585 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[584] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12391,8 +13958,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 586 FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 586 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + 
numerators_sv[585] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12405,8 +13975,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 587 FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 587 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[586] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[103] += amp_sv[0]; jamp_sv[104] -= amp_sv[0]; @@ -12421,8 +13994,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 588 VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 588 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[587] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12475,8 +14051,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 590 FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 590 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[589] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[100] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; @@ -12491,8 +14070,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 591 FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 591 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[590] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12505,8 +14087,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 592 FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 592 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[591] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12519,8 +14104,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 593 VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 593 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[592] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12539,8 +14127,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 594 FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 594 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[593] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[13] += amp_sv[0]; jamp_sv[27] -= amp_sv[0]; @@ -12555,8 +14146,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 595 FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 595 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[594] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12569,8 +14163,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 596 FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 596 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[595] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12583,8 +14180,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 597 FFV1_0( w_fp[78], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 597 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[596] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[76] -= amp_sv[0]; @@ -12599,8 +14199,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 598 VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 598 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[597] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12653,8 +14256,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 600 VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 600 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[599] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12673,8 +14279,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 601 FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 601 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[600] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; @@ -12689,8 +14298,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 602 FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 602 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[601] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12703,8 +14315,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 603 FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 603 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[602] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12717,8 +14332,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 604 FFV1_0( w_fp[60], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 604 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[603] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[14] += amp_sv[0]; jamp_sv[50] -= amp_sv[0]; @@ -12733,8 +14351,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 605 VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 605 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[604] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12787,8 +14408,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 607 FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 607 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[606] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12807,8 +14431,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 608 FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 608 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[607] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12827,8 +14454,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 609 FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 609 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[608] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12847,8 +14477,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 610 FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 610 ) numerators_sv += cxabs2( 
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[609] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; @@ -12863,8 +14496,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 611 FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 611 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[610] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12883,8 +14519,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 612 FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 612 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[611] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[68] += amp_sv[0]; jamp_sv[69] -= amp_sv[0]; @@ -13026,8 +14665,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 616 FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 616 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[615] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13040,8 +14682,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 617 FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 617 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[616] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13054,8 +14699,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 618 VVV1_0( w_fp[112], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 618 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[617] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13074,8 +14722,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 619 FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 619 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[618] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[37] += amp_sv[0]; jamp_sv[38] -= amp_sv[0]; @@ -13090,8 +14741,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 620 VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 620 ) numerators_sv += cxabs2( amp_sv[0] 
); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[619] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13110,8 +14764,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 621 FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 621 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[620] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[31] += amp_sv[0]; jamp_sv[32] -= amp_sv[0]; @@ -13162,8 +14819,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 623 FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 623 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[622] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13176,8 +14836,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 624 FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 624 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[623] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13190,8 +14853,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 625 FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 625 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[624] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[29] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -13206,8 +14872,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 626 FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 626 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[625] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13220,8 +14889,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 627 FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 627 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[626] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13234,8 +14906,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 628 FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 628 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[627] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[28] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -13250,8 +14925,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 629 FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 629 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[628] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[28] += amp_sv[0]; jamp_sv[29] -= amp_sv[0]; @@ -13266,8 +14944,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 630 FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 630 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[629] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] += amp_sv[0]; jamp_sv[33] -= amp_sv[0]; @@ -13282,8 +14963,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 631 FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 631 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[630] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; @@ -13302,8 +14986,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 632 FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 632 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[631] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13316,8 +15003,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 633 FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 633 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[632] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13330,8 +15020,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 634 VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 634 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[633] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13350,8 +15043,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 635 FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 635 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[634] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[21] += amp_sv[0]; jamp_sv[53] -= amp_sv[0]; @@ -13366,8 +15062,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 636 VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 636 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[635] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13386,8 +15085,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 637 FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 637 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[636] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[23] += amp_sv[0]; jamp_sv[77] -= amp_sv[0]; @@ -13436,8 +15138,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 639 FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 639 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[638] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13450,8 +15155,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 640 FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 640 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[639] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13464,8 +15172,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 641 FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 641 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[640] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[53] += amp_sv[0]; jamp_sv[61] -= amp_sv[0]; @@ -13480,8 +15191,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 642 FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 642 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[641] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13494,8 +15208,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 643 FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 643 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[642] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -13508,8 +15225,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 644 FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 644 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[643] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[77] += amp_sv[0]; jamp_sv[85] -= amp_sv[0]; @@ -13524,8 +15244,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 645 FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 645 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[644] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[61] += amp_sv[0]; jamp_sv[64] -= amp_sv[0]; @@ -13540,8 +15263,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 646 FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 646 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[645] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[21] += amp_sv[0]; jamp_sv[23] -= amp_sv[0]; @@ -13556,8 +15282,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 647 FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 647 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[646] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; @@ -13576,8 +15305,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 648 FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 648 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[647] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] += amp_sv[0]; jamp_sv[23] -= amp_sv[0]; @@ -13592,8 +15324,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 649 FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 649 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[648] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; @@ -13612,8 +15347,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 650 FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 650 ) numerators_sv += cxabs2( amp_sv[0] ); 
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[649] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] += amp_sv[0]; jamp_sv[33] -= amp_sv[0]; @@ -13628,8 +15366,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 651 FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 651 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[650] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; @@ -13648,8 +15389,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 652 FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 652 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[651] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; @@ -13668,8 +15412,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 653 FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 653 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[652] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; @@ -13746,8 +15493,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 655 VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 655 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[654] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -13774,8 +15524,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 656 VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 656 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[655] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -13802,8 +15555,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 657 VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 657 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[656] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] -= amp_sv[0]; jamp_sv[23] += amp_sv[0]; @@ -13830,8 +15586,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 658 FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 658 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += 
cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[657] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; @@ -13850,8 +15609,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 659 FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 659 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[658] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[76] += amp_sv[0]; jamp_sv[77] -= amp_sv[0]; @@ -13866,8 +15628,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 660 FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 660 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[659] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[80] += amp_sv[0]; jamp_sv[81] -= amp_sv[0]; @@ -13882,8 +15647,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 661 FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 661 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[660] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; @@ -13902,8 +15670,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 662 FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 662 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[661] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -13918,8 +15689,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 663 FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 663 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[662] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[31] += amp_sv[0]; jamp_sv[34] -= amp_sv[0]; @@ -13934,8 +15708,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 664 FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 664 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[663] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[20] += amp_sv[0]; jamp_sv[21] -= amp_sv[0]; @@ -13950,8 +15727,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 665 FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 665 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[664] += 
cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; @@ -13970,8 +15750,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 666 FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 666 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[665] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[38] += amp_sv[0]; jamp_sv[39] -= amp_sv[0]; @@ -13986,8 +15769,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 667 FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 667 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[666] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; @@ -14006,8 +15792,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 668 FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 668 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[667] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; @@ -14026,8 +15815,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 669 FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 669 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[668] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; @@ -14104,8 +15896,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 671 VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 671 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[670] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] += amp_sv[0]; jamp_sv[22] -= amp_sv[0]; @@ -14132,8 +15927,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 672 VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 672 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[671] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -14160,8 +15958,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 673 VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 673 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[672] += 
cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[20] -= amp_sv[0]; jamp_sv[21] += amp_sv[0]; @@ -14188,8 +15989,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 674 FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 674 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[673] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; @@ -14208,8 +16012,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 675 FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 675 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[674] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[52] += amp_sv[0]; jamp_sv[53] -= amp_sv[0]; @@ -14224,8 +16031,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 676 FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 676 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[675] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[56] += amp_sv[0]; jamp_sv[57] -= amp_sv[0]; @@ -14240,8 +16050,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 677 FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 677 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[676] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; @@ -14260,8 +16073,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 678 FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 678 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[677] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] += amp_sv[0]; jamp_sv[22] -= amp_sv[0]; @@ -14276,8 +16092,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 679 FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 679 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[678] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[37] += amp_sv[0]; jamp_sv[40] -= amp_sv[0]; @@ -14292,8 +16111,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 680 VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 680 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[679] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } 
 #endif
       jamp_sv[19] -= amp_sv[0];
       jamp_sv[29] += amp_sv[0];
@@ -14320,8 +16142,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 681
       VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 681 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[680] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] -= amp_sv[0];
       jamp_sv[28] += amp_sv[0];
@@ -14406,8 +16231,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 683
       VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 683 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[682] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] += amp_sv[0];
       jamp_sv[28] -= amp_sv[0];
@@ -14434,8 +16262,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 684
       VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 684 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[683] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] += amp_sv[0];
       jamp_sv[28] -= amp_sv[0];
@@ -14520,8 +16351,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 686
       VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 686 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[685] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[23] += amp_sv[0];
       jamp_sv[29] -= amp_sv[0];
@@ -14548,8 +16382,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 687
       VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 687 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[686] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] += amp_sv[0];
       jamp_sv[29] -= amp_sv[0];
@@ -14870,8 +16707,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 693
       VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 693 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[692] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[19] -= amp_sv[0];
@@ -14898,8 +16738,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 694
       VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 694 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[693] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[19] -= amp_sv[0];
@@ -14926,8 +16769,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 695
       VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 695 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[694] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] -= amp_sv[0];
       jamp_sv[23] += amp_sv[0];
@@ -14954,8 +16800,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 696
       VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 696 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[695] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -14974,8 +16823,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 697
       FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 697 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[696] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[61] += amp_sv[0];
       jamp_sv[62] -= amp_sv[0];
@@ -14990,8 +16842,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 698
       FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 698 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[697] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15004,8 +16859,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 699
       FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 699 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[698] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15018,8 +16876,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 700
       FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 700 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[699] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[55] += amp_sv[0];
       jamp_sv[56] -= amp_sv[0];
@@ -15034,8 +16895,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 701
       VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 701 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[700] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15088,8 +16952,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 703
       FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 703 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[702] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[52] += amp_sv[0];
       jamp_sv[55] -= amp_sv[0];
@@ -15104,8 +16971,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 704
       FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 704 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[703] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15118,8 +16988,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 705
       FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 705 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[704] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15132,8 +17005,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 706
       VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 706 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[705] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15152,8 +17028,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 707
       FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 707 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[706] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[85] += amp_sv[0];
       jamp_sv[86] -= amp_sv[0];
@@ -15168,8 +17047,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 708
       FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 708 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[707] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15182,8 +17064,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 709
       FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 709 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[708] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15196,8 +17081,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 710
       FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 710 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[709] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[79] += amp_sv[0];
       jamp_sv[80] -= amp_sv[0];
@@ -15212,8 +17100,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 711
       VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 711 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[710] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15266,8 +17157,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 713
       FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 713 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[712] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[76] += amp_sv[0];
       jamp_sv[79] -= amp_sv[0];
@@ -15282,8 +17176,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 714
       FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 714 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[713] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15296,8 +17193,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 715
       FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 715 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[714] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15310,8 +17210,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 716
       VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 716 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[715] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15330,8 +17233,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 717
       FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 717 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[716] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += amp_sv[0];
       jamp_sv[29] -= amp_sv[0];
@@ -15346,8 +17252,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 718
       FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 718 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[717] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15360,8 +17269,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 719
       FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 719 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[718] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15374,8 +17286,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 720
       FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 720 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[719] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] += amp_sv[0];
       jamp_sv[76] -= amp_sv[0];
@@ -15390,8 +17305,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 721
       VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 721 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[720] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15444,8 +17362,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 723
       VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 723 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[722] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15464,8 +17385,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 724
       FFV1_0( w_fp[25], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 724 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[723] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[28] -= amp_sv[0];
@@ -15480,8 +17404,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 725
       FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 725 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[724] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15494,8 +17421,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 726
       FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 726 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[725] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15508,8 +17438,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 727
       FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 727 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[726] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] += amp_sv[0];
       jamp_sv[52] -= amp_sv[0];
@@ -15524,8 +17457,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 728
       VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 728 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[727] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15578,8 +17514,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 730
       FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 730 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[729] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15598,8 +17537,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 731
       FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 731 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[730] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15618,8 +17560,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 732
       FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 732 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[731] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15638,8 +17583,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 733
       FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 733 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[732] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[19] -= amp_sv[0];
@@ -15654,8 +17602,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 734
       FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 734 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[733] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15674,8 +17625,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 735
       FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 735 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[734] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[62] += amp_sv[0];
       jamp_sv[63] -= amp_sv[0];
@@ -15816,8 +17770,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 739
       FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 739 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[738] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[29] -= amp_sv[0];
 
@@ -15829,8 +17786,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 740
       FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 740 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[739] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[27] -= amp_sv[0];
 
@@ -15842,8 +17802,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 741
       FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 741 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[740] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[40] -= amp_sv[0];
 
@@ -15855,8 +17818,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 742
       FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 742 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[741] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[46] -= amp_sv[0];
 
@@ -15868,8 +17834,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 743
       FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 743 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[742] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[37] -= amp_sv[0];
 
@@ -15881,8 +17850,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 744
       FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 744 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[743] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[43] -= amp_sv[0];
 
@@ -15894,8 +17866,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 745
       FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 745 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[744] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15908,8 +17883,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 746
       FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 746 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[745] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15922,8 +17900,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 747
       FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 747 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[746] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[27] += amp_sv[0];
       jamp_sv[29] -= amp_sv[0];
@@ -15938,8 +17919,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 748
       FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 748 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[747] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[28] -= amp_sv[0];
 
@@ -15951,8 +17935,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 749
       FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 749 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[748] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[25] -= amp_sv[0];
 
@@ -15964,8 +17951,11 @@ namespace mg5amcCpu
      // Amplitude(s) for diagram number 750
       FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 750 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[749] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[34] -= amp_sv[0];
 
@@ -15977,8 +17967,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 751
       FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 751 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[750] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[44] -= amp_sv[0];
 
@@ -15990,8 +17983,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 752
       FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 752 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[751] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[31] -= amp_sv[0];
 
@@ -16003,8 +17999,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 753
       FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 753 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[752] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[42] -= amp_sv[0];
 
@@ -16016,8 +18015,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 754
       FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 754 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[753] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16030,8 +18032,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 755
       FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 755 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[754] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16044,8 +18049,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 756
       FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 756 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[755] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[25] += amp_sv[0];
       jamp_sv[28] -= amp_sv[0];
@@ -16060,8 +18068,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 757
       FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 757 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[756] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[26] -= amp_sv[0];
 
@@ -16073,8 +18084,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 758
       FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 758 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[757] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] -= amp_sv[0];
 
@@ -16086,8 +18100,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 759
       FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 759 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[758] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] -= amp_sv[0];
 
@@ -16099,8 +18116,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 760
       FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 760 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[759] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[38] -= amp_sv[0];
 
@@ -16112,8 +18132,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 761
       FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 761 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[760] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[30] -= amp_sv[0];
 
@@ -16125,8 +18148,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 762
       FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 762 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[761] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[36] -= amp_sv[0];
 
@@ -16138,8 +18164,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 763
       FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 763 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[762] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16152,8 +18181,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 764
       FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 764 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[763] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16166,8 +18198,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 765
       FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 765 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[764] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += amp_sv[0];
       jamp_sv[26] -= amp_sv[0];
@@ -16182,8 +18217,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 766
       FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 766 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[765] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16196,8 +18234,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 767
       FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 767 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[766] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += amp_sv[0];
       jamp_sv[26] -= amp_sv[0];
@@ -16212,8 +18253,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 768
       VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 768 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[767] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16232,8 +18276,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 769
       FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 769 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[768] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[42] += amp_sv[0];
       jamp_sv[43] -= amp_sv[0];
@@ -16248,8 +18295,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 770
       VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 770 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[769] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16268,8 +18318,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 771
       FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 771 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[770] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16318,8 +18371,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 773
       FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 773 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[772] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16332,8 +18388,11 @@ namespace mg5amcCpu
      // Amplitude(s) for diagram number 774
       FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 774 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[773] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[25] += amp_sv[0];
       jamp_sv[26] -= amp_sv[0];
@@ -16348,8 +18407,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 775
       VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 775 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[774] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16368,8 +18430,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 776
       FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 776 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[775] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[36] += amp_sv[0];
       jamp_sv[37] -= amp_sv[0];
@@ -16384,8 +18449,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 777
       VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 777 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[776] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16404,8 +18472,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 778
       FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 778 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[777] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16454,8 +18525,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 780
       FFV1_0( w_fp[12], w_fp[92], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 780 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[779] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16468,8 +18542,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 781
       FFV1_0( w_fp[3], w_fp[92], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 781 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[780] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += amp_sv[0];
       jamp_sv[25] -= amp_sv[0];
@@ -16484,8 +18561,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 782
       VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 782 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[781] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16504,8 +18584,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 783
       FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 783 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[782] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[30] += amp_sv[0];
       jamp_sv[31] -= amp_sv[0];
@@ -16520,8 +18603,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 784
       VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 784 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[783] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16540,8 +18626,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 785
       FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 785 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[784] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16648,8 +18737,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 789
       FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 789 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[788] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[64] -= amp_sv[0];
 
@@ -16661,8 +18753,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 790
       FFV1_0( w_fp[90], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 790 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[789] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[70] -= amp_sv[0];
 
@@ -16674,8 +18769,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 791
       FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 791 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[790] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[53] -= amp_sv[0];
 
@@ -16687,8 +18785,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 792
       FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 792 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[791] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[51] -= amp_sv[0];
 
@@ -16700,8 +18801,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 793
       FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 793 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[792] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[67] -= amp_sv[0];
 
@@ -16713,8 +18817,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 794
       FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 794 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[793] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[61] -= amp_sv[0];
 
@@ -16726,8 +18833,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 795
       FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 795 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[794] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16740,8 +18850,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 796
       FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 796 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[795] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16754,8 +18867,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 797
       FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 797 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[796] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[51] += amp_sv[0];
       jamp_sv[53] -= amp_sv[0];
@@ -16770,8 +18886,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 798
       FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 798 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[797] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[88] -= amp_sv[0];
 
@@ -16783,8 +18902,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 799
       FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 799 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[798] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[94] -= amp_sv[0];
 
@@ -16796,8 +18918,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 800
       FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 800 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[799] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[77] -= amp_sv[0];
 
@@ -16809,8 +18934,11 @@ namespace mg5amcCpu
      // Amplitude(s) for diagram number 801
       FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 801 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[800] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[75] -= amp_sv[0];
 
@@ -16822,8 +18950,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 802
       FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 802 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[801] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[91] -= amp_sv[0];
 
@@ -16835,8 +18966,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 803
       FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 803 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[802] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[85] -= amp_sv[0];
 
@@ -16848,8 +18982,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 804
       FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 804 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[803] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16862,8 +18999,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 805
       FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 805 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[804] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16876,8 +19016,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 806
       FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 806 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[805] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[75] += amp_sv[0];
       jamp_sv[77] -= amp_sv[0];
@@ -16892,8 +19035,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 807
       FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 807 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[806] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[112] -= amp_sv[0];
 
@@ -16905,8 +19051,11 @@ namespace mg5amcCpu
      // Amplitude(s) for diagram number 808
       FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 808 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[807] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[118] -= amp_sv[0];
 
@@ -16918,8 +19067,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 809
       FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 809 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[808] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[101] -= amp_sv[0];
 
@@ -16931,8 +19083,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 810
       FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 810 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[809] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[99] -= amp_sv[0];
 
@@ -16944,8 +19099,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 811
       FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 811 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[810] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[115] -= amp_sv[0];
 
@@ -16957,8 +19115,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 812
       FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 812 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[811] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[109] -= amp_sv[0];
 
@@ -16970,8 +19131,11 @@ namespace mg5amcCpu
      // Amplitude(s) for diagram number 813
       FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 813 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[812] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16984,8 +19148,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 814
       FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 814 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[813] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16998,8 +19165,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 815
       FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 815 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[814] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[99] += amp_sv[0];
       jamp_sv[101] -= amp_sv[0];
@@ -17014,8 +19184,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 816
       FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 816 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[815] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17028,8 +19201,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 817
       FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 817 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[816] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[64] += amp_sv[0];
       jamp_sv[88] -= amp_sv[0];
@@ -17044,8 +19220,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 818
       VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 818 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[817] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17064,8 +19243,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 819
       FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 819 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[818] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += amp_sv[0];
       jamp_sv[15] -= amp_sv[0];
@@ -17080,8 +19262,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 820
       VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 820 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[819] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17100,8 +19285,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 821
       FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 821 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[820] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17148,8 +19336,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 823
       FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 823 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[822] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17162,8 +19353,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 824
       FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 824 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[823] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[70] += amp_sv[0];
       jamp_sv[88] -= amp_sv[0];
@@ -17178,8 +19372,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 825
       VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 825 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[824] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17198,8 +19395,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 826
       FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 826 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[825] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += amp_sv[0];
       jamp_sv[21] -= amp_sv[0];
@@ -17214,8 +19414,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 827
       VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 827 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[826] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17234,8 +19437,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 828
       FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 828 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[827] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17282,8 +19488,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 830
       FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 830 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[829] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[118] -=
cxtype( 0, 1 ) * amp_sv[0]; @@ -17296,8 +19505,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 831 FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 831 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[830] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[64] += amp_sv[0]; jamp_sv[70] -= amp_sv[0]; @@ -17312,8 +19524,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 832 VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 832 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[831] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -17332,8 +19547,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 833 FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 833 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[832] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += amp_sv[0]; jamp_sv[23] -= amp_sv[0]; @@ -17348,8 +19566,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 834 VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 834 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[833] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -17368,8 +19589,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 835 FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 835 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[834] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -17472,8 +19696,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 839 VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 839 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[838] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; @@ -17500,8 +19727,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 840 VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 840 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[839] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; @@ -17586,8 
+19816,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 842 VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 842 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[841] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -17614,8 +19847,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 843 VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 843 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[842] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -17700,8 +19936,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 845 VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 845 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[844] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -17728,8 +19967,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 846 VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 846 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[845] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -18054,8 +20296,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 852 VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 852 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[851] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -18082,8 +20327,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 853 VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 853 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[852] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -18110,8 +20358,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 854 VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 854 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[853] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] -= amp_sv[0]; jamp_sv[17] += amp_sv[0]; @@ -18138,8 +20389,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 855 VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 855 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[854] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18158,8 +20412,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 856 FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 856 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[855] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] += amp_sv[0]; jamp_sv[91] -= amp_sv[0]; @@ -18174,8 +20431,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 857 FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 857 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[856] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18188,8 +20448,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 858 FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 858 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[857] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += amp_sv[0]; jamp_sv[74] -= amp_sv[0]; @@ -18204,8 +20467,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 859 FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 859 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[858] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18218,8 +20484,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 860 VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 860 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[859] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; @@ -18272,8 +20541,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 862 FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 862 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[861] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += amp_sv[0]; jamp_sv[74] -= amp_sv[0]; @@ -18288,8 +20560,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 863 FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 863 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[862] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18302,8 +20577,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 864 FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 864 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[863] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18316,8 +20594,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 865 VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 865 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[864] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18336,8 +20617,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 866 FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 866 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[865] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[114] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; @@ -18352,8 +20636,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 867 FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 867 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[866] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18366,8 +20653,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 868 FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 868 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[867] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += amp_sv[0]; jamp_sv[98] -= amp_sv[0]; @@ -18382,8 +20672,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 869 FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 869 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[868] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18396,8 +20689,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 870 VVV1_0( w_fp[0], w_fp[63], w_fp[51], 
COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 870 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[869] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; @@ -18450,8 +20746,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 872 FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 872 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[871] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += amp_sv[0]; jamp_sv[98] -= amp_sv[0]; @@ -18466,8 +20765,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 873 FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 873 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[872] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18480,8 +20782,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 874 FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 874 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[873] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18494,8 +20799,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 875 VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 875 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[874] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18514,8 +20822,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 876 FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 876 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[875] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -18530,8 +20841,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 877 FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 877 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[876] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18544,8 +20858,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 878 FFV1_0( 
w_fp[104], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 878 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[877] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[34] += amp_sv[0]; jamp_sv[58] -= amp_sv[0]; @@ -18560,8 +20877,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 879 FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 879 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[878] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18574,8 +20894,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 880 VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 880 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[879] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; @@ -18628,8 +20951,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 882 VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 882 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[881] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18648,8 +20974,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 883 FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 883 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[882] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -18664,8 +20993,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 884 FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 884 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[883] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18678,8 +21010,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 885 FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 885 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[884] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] += amp_sv[0]; jamp_sv[56] -= amp_sv[0]; @@ -18694,8 +21029,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 886 FFV1_0( w_fp[40], 
w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 886 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[885] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18708,8 +21046,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 887 VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 887 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[886] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; @@ -18762,8 +21103,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 889 FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 889 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[888] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; @@ -18782,8 +21126,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 890 FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 890 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[889] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; @@ -18802,8 +21149,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 891 FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 891 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[890] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; @@ -18822,8 +21172,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 892 FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 892 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[891] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; @@ -18842,8 +21195,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 893 FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 893 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[892] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[30] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -18858,8 +21214,11 @@ namespace mg5amcCpu // 
Amplitude(s) for diagram number 894 FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 894 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[893] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] += amp_sv[0]; jamp_sv[91] -= amp_sv[0]; @@ -18874,8 +21233,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 895 VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 895 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[894] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; @@ -18902,8 +21264,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 896 VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 896 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[895] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; jamp_sv[12] += amp_sv[0]; @@ -18988,8 +21353,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 898 VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 898 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[897] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -19016,8 +21384,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 899 VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 899 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[898] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -19102,8 +21473,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 901 VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 901 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[900] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -19130,8 +21504,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 902 VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 902 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[901] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; @@ -19454,8 +21831,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 908 VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 
908 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[907] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; @@ -19482,8 +21862,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 909 VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 909 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[908] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; @@ -19510,8 +21893,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 910 VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 910 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[909] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= amp_sv[0]; jamp_sv[11] += amp_sv[0]; @@ -19538,8 +21924,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 911 VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 911 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[910] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19558,8 +21947,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 912 FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 912 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[911] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += amp_sv[0]; jamp_sv[67] -= amp_sv[0]; @@ -19574,8 +21966,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 913 FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 913 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[912] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19588,8 +21983,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 914 FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 914 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[913] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += amp_sv[0]; jamp_sv[50] -= amp_sv[0]; @@ -19604,8 +22002,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 915 FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 915 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[914] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19618,8 +22019,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 916 VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 916 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[915] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; @@ -19672,8 +22076,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 918 FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 918 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[917] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += amp_sv[0]; jamp_sv[50] -= amp_sv[0]; @@ -19688,8 +22095,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 919 FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 919 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[918] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19702,8 +22112,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 920 FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 920 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[919] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19716,8 +22129,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 921 VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 921 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[920] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19736,8 +22152,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 922 FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 922 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[921] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[108] += amp_sv[0]; jamp_sv[109] -= amp_sv[0]; @@ -19752,8 +22171,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 923 FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 923 ) numerators_sv += cxabs2( 
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[922] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19766,8 +22188,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 924 FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 924 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[923] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] += amp_sv[0]; jamp_sv[98] -= amp_sv[0]; @@ -19782,8 +22207,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 925 FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 925 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[924] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19796,8 +22224,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 926 VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 926 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[925] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; @@ -19850,8 +22281,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 928 FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 928 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[927] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] += amp_sv[0]; jamp_sv[100] -= amp_sv[0]; @@ -19866,8 +22300,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 929 FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 929 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[928] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19880,8 +22317,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 930 FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 930 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[929] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19894,8 +22334,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 931 VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( 
channelId == 931 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[930] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19914,8 +22357,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 932 FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 932 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[931] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; @@ -19930,8 +22376,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 933 FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 933 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[932] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19944,8 +22393,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 934 FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 934 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[933] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] += amp_sv[0]; jamp_sv[82] -= amp_sv[0]; @@ -19960,8 +22412,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 935 FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 935 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[934] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19974,8 +22429,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 936 VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 936 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[935] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; @@ -20028,8 +22486,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 938 VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 938 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[937] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -20048,8 +22509,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 939 FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 939 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[938] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += amp_sv[0]; jamp_sv[12] -= amp_sv[0]; @@ -20064,8 +22528,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 940 FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 940 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[939] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -20078,8 +22545,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 941 FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 941 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[940] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[38] += amp_sv[0]; jamp_sv[56] -= amp_sv[0]; @@ -20094,8 +22564,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 942 FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 942 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[941] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -20108,8 +22581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 943 VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 943 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[942] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; @@ -20162,8 +22638,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 945 FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 945 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[944] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; @@ -20182,8 +22661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 946 FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 946 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[945] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; @@ -20202,8 +22684,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 947 FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], 
1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 947 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[946] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; @@ -20222,8 +22707,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 948 FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 948 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[947] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; @@ -20242,8 +22730,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 949 FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 949 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[948] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -20258,8 +22749,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 950 FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 950 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[949] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += amp_sv[0]; jamp_sv[67] -= amp_sv[0]; @@ -20274,8 +22768,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 951 VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 951 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[950] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; jamp_sv[19] += amp_sv[0]; @@ -20302,8 +22799,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 952 VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 952 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[951] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; jamp_sv[18] += amp_sv[0]; @@ -20388,8 +22888,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 954 VVV1_0( w_fp[56], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 954 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[953] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -20416,8 +22919,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 955 VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 955 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[954] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -20502,8 +23008,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 957 VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 957 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[956] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -20530,8 +23039,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 958 VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 958 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[957] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; @@ -20852,8 +23364,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 964 VVV1_0( w_fp[8], w_fp[24], w_fp[71], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 964 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[963] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -20880,8 +23395,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 965 VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 965 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[964] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -20908,8 +23426,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 966 VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 966 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[965] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= amp_sv[0]; jamp_sv[9] += amp_sv[0]; @@ -20936,8 +23457,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 967 VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 967 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[966] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -20956,8 +23480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 968 FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 968 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 
) + { + numerators_sv[967] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += amp_sv[0]; jamp_sv[61] -= amp_sv[0]; @@ -20972,8 +23499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 969 FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 969 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[968] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -20986,8 +23516,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 970 FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 970 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[969] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += amp_sv[0]; jamp_sv[50] -= amp_sv[0]; @@ -21002,8 +23535,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 971 FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 971 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[970] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21016,8 +23552,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 972 VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 972 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[971] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21070,8 +23609,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 974 FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 974 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[973] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += amp_sv[0]; jamp_sv[52] -= amp_sv[0]; @@ -21086,8 +23628,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 975 FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 975 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[974] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21100,8 +23645,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 976 FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 976 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) 
+ { + numerators_sv[975] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21114,8 +23662,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 977 VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 977 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[976] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21134,8 +23685,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 978 FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 978 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[977] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] += amp_sv[0]; jamp_sv[85] -= amp_sv[0]; @@ -21150,8 +23704,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 979 FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 979 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[978] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21164,8 +23721,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 980 FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 980 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[979] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += amp_sv[0]; jamp_sv[74] -= amp_sv[0]; @@ -21180,8 +23740,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 981 FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 981 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[980] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21194,8 +23757,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 982 VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 982 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[981] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21248,8 +23814,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 984 FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 984 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[983] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += amp_sv[0]; jamp_sv[76] -= amp_sv[0]; @@ -21264,8 +23833,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 985 FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 985 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[984] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21278,8 +23850,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 986 FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 986 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[985] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21292,8 +23867,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 987 VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 987 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[986] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21312,8 +23890,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 988 FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 988 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[987] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; @@ -21328,8 +23909,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 989 FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 989 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[988] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21342,8 +23926,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 990 FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 990 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[989] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[46] += amp_sv[0]; jamp_sv[82] -= amp_sv[0]; @@ -21358,8 +23945,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 991 FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 991 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] 
); + if( channelId != 0 ) + { + numerators_sv[990] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21372,8 +23962,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 992 VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 992 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[991] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21426,8 +24019,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 994 VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 994 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[993] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21446,8 +24042,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 995 FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 995 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[994] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[18] -= amp_sv[0]; @@ -21462,8 +24061,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 996 FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 996 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[995] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21476,8 +24078,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 997 FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 997 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[996] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[44] += amp_sv[0]; jamp_sv[58] -= amp_sv[0]; @@ -21492,8 +24097,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 998 FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 998 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[997] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21506,8 +24114,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 999 VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 999 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[998] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21560,8 +24171,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1001 FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1001 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1000] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21580,8 +24194,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1002 FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1002 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1001] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21600,8 +24217,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1003 FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1003 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1002] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21620,8 +24240,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1004 FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1004 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1003] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21640,8 +24263,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1005 FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1005 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1004] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] += amp_sv[0]; jamp_sv[43] -= amp_sv[0]; @@ -21656,8 +24282,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1006 FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1006 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1005] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += amp_sv[0]; jamp_sv[61] -= amp_sv[0]; @@ -21672,8 +24301,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1007 VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1007 ) numerators_sv += 
cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1006] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -21700,8 +24332,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1008 VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1008 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1007] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -21786,8 +24421,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1010 VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1010 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1009] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -21814,8 +24452,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1011 VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1011 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1010] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; @@ -21900,8 +24541,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1013 VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1013 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1012] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -21928,8 +24572,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1014 VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1014 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1013] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -22192,8 +24839,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1019 VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1019 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1018] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -22220,8 +24870,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1020 VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1020 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1019] 
+= cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -22306,8 +24959,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1022 VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1022 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1021] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += amp_sv[0]; jamp_sv[21] -= amp_sv[0]; @@ -22334,8 +24990,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1023 VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1023 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1022] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -22420,8 +25079,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1025 VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1025 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1024] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -22448,8 +25110,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1026 VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1026 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1025] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -22710,8 +25375,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1031 VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1031 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1030] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -22738,8 +25406,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1032 VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1032 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1031] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -22824,8 +25495,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1034 VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1034 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1033] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += amp_sv[0]; jamp_sv[23] -= 
amp_sv[0]; @@ -22852,8 +25526,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1035 VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1035 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1034] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[22] -= amp_sv[0]; @@ -22938,8 +25615,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1037 VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1037 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1036] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; @@ -22966,8 +25646,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1038 VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1038 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1037] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -23504,8 +26187,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1046 FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1046 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1045] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[52] -= amp_sv[0]; @@ -23517,8 +26203,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1047 FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1047 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1046] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] -= amp_sv[0]; @@ -23530,8 +26219,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1048 FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1048 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1047] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[58] -= amp_sv[0]; @@ -23543,8 +26235,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1049 FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1049 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1048] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[68] -= amp_sv[0]; @@ -23556,8 +26251,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1050 FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1050 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1049] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[55] -= amp_sv[0]; @@ -23569,8 +26267,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1051 FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1051 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1050] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] -= amp_sv[0]; @@ -23582,8 +26283,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1052 FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1052 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1051] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[50] -= amp_sv[0]; @@ -23595,8 +26299,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1053 FFV1_0( w_fp[40], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1053 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1052] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] -= amp_sv[0]; @@ -23608,8 +26315,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1054 FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1054 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1053] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[56] -= amp_sv[0]; @@ -23621,8 +26331,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1055 FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1055 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1054] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[62] -= amp_sv[0]; @@ -23634,8 +26347,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1056 FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1056 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1055] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[54] -= amp_sv[0]; @@ -23647,8 +26363,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1057 FFV1_0( w_fp[60], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1057 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1056] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] -= amp_sv[0]; @@ -23660,8 +26379,11 @@ namespace mg5amcCpu // 
Amplitude(s) for diagram number 1058 FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1058 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1057] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += amp_sv[0]; jamp_sv[49] -= amp_sv[0]; @@ -23676,8 +26398,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1059 FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1059 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1058] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -23690,8 +26415,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1060 FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1060 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1059] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[54] += amp_sv[0]; jamp_sv[55] -= amp_sv[0]; @@ -23706,8 +26434,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1061 VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1061 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1060] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; @@ -23726,8 +26457,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1062 FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1062 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1061] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -23740,8 +26474,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1063 VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1063 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1062] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; @@ -23794,8 +26531,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1065 FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1065 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1064] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[76] -= amp_sv[0]; @@ -23807,8 +26547,11 @@ namespace mg5amcCpu // 
Amplitude(s) for diagram number 1066 FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1066 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1065] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] -= amp_sv[0]; @@ -23820,8 +26563,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1067 FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1067 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1066] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[82] -= amp_sv[0]; @@ -23833,8 +26579,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1068 FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1068 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1067] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[92] -= amp_sv[0]; @@ -23846,8 +26595,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1069 FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1069 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1068] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[79] -= amp_sv[0]; @@ -23859,8 +26611,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1070 FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1070 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1069] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] -= amp_sv[0]; @@ -23872,8 +26627,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1071 FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1071 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1070] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] -= amp_sv[0]; @@ -23885,8 +26643,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1072 FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1072 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1071] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] -= amp_sv[0]; @@ -23898,8 +26659,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1073 FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1073 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + 
numerators_sv[1072] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[80] -= amp_sv[0]; @@ -23911,8 +26675,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1074 FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1074 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1073] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[86] -= amp_sv[0]; @@ -23924,8 +26691,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1075 FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1075 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1074] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[78] -= amp_sv[0]; @@ -23937,8 +26707,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1076 FFV1_0( w_fp[60], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1076 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1075] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] -= amp_sv[0]; @@ -23950,8 +26723,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1077 FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1077 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1076] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += amp_sv[0]; jamp_sv[73] -= amp_sv[0]; @@ -23966,8 +26742,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1078 FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1078 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1077] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -23980,8 +26759,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1079 FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1079 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1078] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[78] += amp_sv[0]; jamp_sv[79] -= amp_sv[0]; @@ -23996,8 +26778,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1080 VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1080 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1079] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; @@ 
-24016,8 +26801,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1081 FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1081 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1080] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24030,8 +26818,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1082 VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1082 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1081] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24084,8 +26875,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1084 FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1084 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1083] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[100] -= amp_sv[0]; @@ -24097,8 +26891,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1085 FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1085 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1084] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] -= amp_sv[0]; @@ -24110,8 +26907,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1086 FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1086 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1085] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[106] -= amp_sv[0]; @@ -24123,8 +26923,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1087 FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1087 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1086] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[116] -= amp_sv[0]; @@ -24136,8 +26939,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1088 FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1088 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1087] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[103] -= amp_sv[0]; @@ -24149,8 +26955,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1089 FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL 
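// Note: every hunk in this block applies the same mechanical change; as a
// minimal sketch (with d standing for the 1-based diagram number of a given
// hunk, d = 1089 for the pair just below), the old code accumulated a single
// running numerator only for the one selected channel, while the new code
// records every diagram's |amp|^2 in its own per-event slot whenever any
// channel is selected, so that sigmaKin can later pick out the slot
// numerators_sv[channelId - 1]:
const int d = 1089; // hypothetical stand-in for the diagram number of a hunk
if( channelId != 0 )
{
  numerators_sv[d - 1] += cxabs2( amp_sv[0] ); // 0-based slot for 1-based diagram d
  denominators_sv += cxabs2( amp_sv[0] );
}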
- if( channelId == 1089 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1088] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[114] -= amp_sv[0]; @@ -24162,8 +26971,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1090 FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1090 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1089] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[98] -= amp_sv[0]; @@ -24175,8 +26987,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1091 FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1091 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1090] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] -= amp_sv[0]; @@ -24188,8 +27003,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1092 FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1092 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1091] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[104] -= amp_sv[0]; @@ -24201,8 +27019,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1093 FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1093 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1092] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[110] -= amp_sv[0]; @@ -24214,8 +27035,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1094 FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1094 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1093] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[102] -= amp_sv[0]; @@ -24227,8 +27051,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1095 FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1095 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1094] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[108] -= amp_sv[0]; @@ -24240,8 +27067,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1096 FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1096 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1095] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += amp_sv[0]; jamp_sv[97] -= 
amp_sv[0]; @@ -24256,8 +27086,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1097 FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1097 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1096] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24270,8 +27103,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1098 FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1098 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1097] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[102] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; @@ -24286,8 +27122,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1099 VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1099 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1098] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24306,8 +27145,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1100 FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1100 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1099] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24320,8 +27162,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1101 VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1101 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1100] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24374,8 +27219,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1103 FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1103 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1102] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] += amp_sv[0]; jamp_sv[46] -= amp_sv[0]; @@ -24390,8 +27238,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1104 FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1104 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1103] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[92] += cxtype( 
0, 1 ) * amp_sv[0]; jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24404,8 +27255,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1105 FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1105 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1104] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[22] -= amp_sv[0]; @@ -24420,8 +27274,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1106 VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1106 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1105] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24440,8 +27297,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1107 FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1107 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1106] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24454,8 +27314,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1108 VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1108 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1107] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24508,8 +27371,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1110 FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1110 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1109] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[34] += amp_sv[0]; jamp_sv[44] -= amp_sv[0]; @@ -24524,8 +27390,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1111 FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1111 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1110] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24538,8 +27407,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1112 FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1112 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1111] += cxabs2( amp_sv[0] ); + denominators_sv += 
cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -24554,8 +27426,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1113 VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1113 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1112] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24574,8 +27449,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1114 FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1114 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1113] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24588,8 +27466,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1115 VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1115 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1114] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24642,8 +27523,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1117 FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1117 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1116] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] += amp_sv[0]; jamp_sv[38] -= amp_sv[0]; @@ -24658,8 +27542,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1118 FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1118 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1117] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24672,8 +27559,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1119 FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1119 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1118] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; @@ -24688,8 +27578,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1120 VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1120 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1119] += cxabs2( amp_sv[0] ); + 
denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24708,8 +27601,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1121 FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1121 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1120] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24722,8 +27618,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1122 VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1122 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1121] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -30778,7 +33677,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -30808,11 +33706,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -30887,38 +33786,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -31097,11 +33965,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 4f7b5172f1..2832528673 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. 
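The CPPProcess.cc hunks above are the core of the change: instead of one scalar numerator per event, filled only when the diagram number matches the selected channel, every diagram k now accumulates |amp|^2 into its own slot numerators_sv[k-1], and sigmaKin later multiplies the matrix element by the slot of the selected channel over the sum of all slots. A minimal self-contained sketch of that logic, assuming simplified stand-in types (plain double per event instead of fptype_sv SIMD pages, a loop instead of the unrolled per-diagram code that the plugin generates):

// Minimal sketch (hypothetical, simplified) of per-diagram numerator bookkeeping
// for multichannel single-diagram enhancement: ME *= N_channel / sum_k N_k.
#include <cassert>
#include <complex>
#include <cstdio>

constexpr int ndiagrams = 5; // stands in for processConfig::ndiagrams

inline double cxabs2( const std::complex<double>& c ) { return std::norm( c ); } // |amp|^2

int main()
{
  // Dummy amplitudes standing in for the FFV1_0/VVV1_0 outputs, one per diagram
  const std::complex<double> amp[ndiagrams] = { { 1, 2 }, { .5, -1 }, { -2, .3 }, { 0, 1 }, { 1, 0 } };
  const unsigned int channelId = 2; // 1-based selected diagram (0 disables the enhancement)

  double numerators[ndiagrams] = {}; // one slot per diagram (previously: a single scalar)
  double denominators = 0;
  if( channelId != 0 )
    for( int idiag = 0; idiag < ndiagrams; ++idiag ) // the generated code unrolls this loop
    {
      numerators[idiag] += cxabs2( amp[idiag] ); // numerators_sv[idiag] += cxabs2( amp_sv[0] )
      denominators += cxabs2( amp[idiag] );      // denominators_sv += cxabs2( amp_sv[0] )
    }

  double ME = 1.; // stand-in matrix element from the color sum
  assert( channelId >= 1 && channelId <= ndiagrams );
  ME *= numerators[channelId - 1] / denominators; // MEs_sv *= numerators_sv[channelId-1] / denominators_sv
  std::printf( "enhancement factor = %f\n", ME );
  return 0;
}

The denominator is unchanged with respect to the old code: it is still the running sum of |amp|^2 over all diagrams, so the net effect of the new layout is that the numerator of any channel remains available at reweighting time, not only the one known while the amplitudes were being computed.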
This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004235267639160156  +DEBUG: model prefixing takes 0.0018684864044189453  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,13 +151,13 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.490 s +1 processes with 1240 diagrams generated in 0.701 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 @@ -165,18 +166,18 @@ INFO: Processing color information for process: g g > t t~ g g g @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 
'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 5.122 s +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. +Generated helas calls for 1 subprocesses (1240 diagrams) in 2.523 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.290 s +ALOHA: aloha creates 5 routines in 0.138 s VVV1 VVV1 FFV1 @@ -189,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.290 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
quit -real 0m10.012s -user 0m9.867s -sys 0m0.109s -Code generation completed in 10 seconds +real 0m8.591s +user 0m5.091s +sys 0m0.309s +Code generation completed in 9 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index 2f17add993..cc4e37eaa9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks!
#898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -32668,7 +32732,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -32698,11 +32761,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -32777,38 +32841,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -32987,11 +33020,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 71b7095c67..2a38116d7d 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. 
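The same consolidation repeats below for each generated process: the channelId extraction that sigmaKin previously inlined (the removed block above) now lives in the getChannelId helper added near the top of each CPPProcess.cc. On the C++ side it asserts that all neppV lanes of a SIMD event page carry the same channel (#898) and, in mixed precision, that the second neppV page agrees as well (#924); the reweighting call site passes sanityCheckMixedPrecision=false because that second check already ran in calculate_jamps. A minimal sketch of the invariant being enforced, assuming plain arrays in place of uint_sv and of the MemoryAccessChannelIds helpers:

// Minimal sketch (hypothetical): one channelId per SIMD event page, as asserted
// by the consolidated getChannelId() helper in this patch.
#include <cassert>

constexpr int neppV = 4; // SIMD lanes per event page (stand-in value)

unsigned int getChannelIdSketch( const unsigned int* allChannelIds, int ievt00, bool checkSecondPage = true )
{
  unsigned int channelId = 0; // 0 disables the single-diagram enhancement
  if( allChannelIds != nullptr )
  {
    const unsigned int* page = allChannelIds + ievt00; // first (or only) neppV page
    channelId = page[0];
    for( int i = 1; i < neppV; ++i )
      assert( channelId == page[i] ); // sanity check #898: one channel per page
    assert( channelId > 0 );          // multichannel enabled => channelId > 0
    if( checkSecondPage )             // mixed precision: second neppV page (#924)
    {
      const unsigned int* page2 = allChannelIds + ievt00 + neppV;
      for( int i = 0; i < neppV; ++i )
        assert( channelId == page2[i] );
    }
  }
  return channelId;
}

int main()
{
  const unsigned int ids[2 * neppV] = { 5, 5, 5, 5, 5, 5, 5, 5 };
  return getChannelIdSketch( ids, 0 ) == 5 ? 0 : 1;
}

In the real helper the second-page check is additionally guarded by the MGONGPU_CPPSIMD and mixed-precision preprocessor conditions, and the CUDA branch reads a single scalar per thread instead of a page.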
This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004422187805175781  +DEBUG: model prefixing takes 0.001802682876586914  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,7 +166,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.058 s +8 processes with 40 diagrams generated in 0.043 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -176,10 +177,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -199,9 +200,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} 
[model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -210,50 +211,52 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.026 s -Wrote files for 32 helas calls in 0.131 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  +Generated helas calls for 2 subprocesses (10 diagrams) in 0.025 s +Wrote files for 32 helas calls in 2.532 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.106 s +ALOHA: aloha creates 2 routines in 0.094 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.094 s +ALOHA: aloha creates 4 routines in 0.079 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
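The MemoryAccessGs.h hunks (repeated above and below for every process directory) add kernelAccessP, a pointer-returning variant of the existing reference-returning kernelAccess: with ndiagrams fptype_sv slots now stored contiguously per event page, callers need numerators_sv[idiag] indexing rather than a single reference. A small sketch of the layout and the two access styles, with stand-in types (the real fptype_sv is a scalar-or-SIMD-vector alias chosen by the mgOnGpu headers):

// Minimal sketch (hypothetical types): reference access to one slot vs pointer
// access to ndiagrams consecutive slots of one SIMD event page.
using fptype = double;
struct fptype_sv { fptype v[4]; }; // stand-in for the scalar-or-vector type

inline fptype_sv& kernelAccess( fptype* buffer ) { return *reinterpret_cast<fptype_sv*>( buffer ); }
inline fptype_sv* kernelAccessP( fptype* buffer ) { return reinterpret_cast<fptype_sv*>( buffer ); }

int main()
{
  constexpr int ndiagrams = 5; // stands in for processConfig::ndiagrams
  constexpr int neppV = 4;     // lanes per SIMD page
  fptype buffer[ndiagrams * neppV] = {}; // ndiagrams consecutive SIMD pages
  fptype_sv* numerators_sv = kernelAccessP( buffer );
  for( int i = 0; i < ndiagrams; ++i ) // e.g. the zero-initialization loop in calculate_jamps
    numerators_sv[i].v[0] += 1.;
  return 0;
}

Denominators keep the old reference-returning kernelAccess throughout the patch, since they remain a single slot per event.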
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m2.314s -user 0m1.828s -sys 0m0.404s -Code generation completed in 2 seconds +real 0m7.934s +user 0m1.348s +sys 0m0.746s +Code generation completed in 8 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -274,9 +277,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -303,9 +306,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gq_ttq.mad/Source/make_opts b/epochX/cudacpp/gq_ttq.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/make_opts +++ b/epochX/cudacpp/gq_ttq.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype>
BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 7d4745918b..35d6d0610c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -360,7 +424,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -374,7 +438,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. * amp_sv[0]; @@ -412,8 +479,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -426,8 +496,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -440,8 +513,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -454,8 +530,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; @@ -1117,7 +1196,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1147,11 +1225,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1226,38 +1305,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1436,11 +1484,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 414284b61a..cc683b55c1 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -360,7 +424,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -374,7 +438,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -412,8 +479,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -426,8 +496,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6. * amp_sv[0]; @@ -440,8 +513,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[4], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. * amp_sv[0]; @@ -454,8 +530,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; @@ -1117,7 +1196,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1147,11 +1225,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1226,38 +1305,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1436,11 +1484,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index d16040de18..f3f83367a3 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004274129867553711  +DEBUG: model prefixing takes 0.0017499923706054688  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,13 +166,13 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.059 s +8 processes with 40 diagrams generated in 0.043 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -188,40 +189,40 @@ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  DEBUG: type(subproc_group)= [output.py at line 223]  DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=1 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. -Generated helas calls for 2 subprocesses (10 diagrams) in 0.023 s +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. +Generated helas calls for 2 subprocesses (10 diagrams) in 0.026 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.105 s +ALOHA: aloha creates 2 routines in 0.062 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
+FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.535s -user 0m0.481s -sys 0m0.048s -Code generation completed in 1 seconds +real 0m1.860s +user 0m0.420s +sys 0m0.146s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index e57428e73e..1236fdcfcc 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -27,6 +27,7
@@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -360,7 +424,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -374,7 +438,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -1112,7 +1176,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1142,11 +1205,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1221,38 +1285,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1431,11 +1464,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index 57dd4fee2d..d4db88aa57 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace 
mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -360,7 +424,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -374,7 +438,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -1112,7 +1176,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1142,11 +1205,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1221,38 +1285,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1431,11 +1464,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index faef5b2d67..fb6910fcf6 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. 
This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,14 +49,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  @@ -133,10 +134,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --ve INFO: initialize a new directory: CODEGEN_mad_heft_gg_bb INFO: remove old information in CODEGEN_mad_heft_gg_bb DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -148,55 +149,57 @@ FileWriter b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group gg_bbx -DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (4 diagrams) in 0.008 s -Wrote files for 12 helas calls in 0.062 s +DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (4 diagrams) in 0.005 s +Wrote files for 12 helas calls in 1.556 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set 
of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.193 s +ALOHA: aloha creates 4 routines in 0.120 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.178 s +ALOHA: aloha creates 8 routines in 0.112 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README Run "open index.html" to see more information about this process. 
quit -real 0m2.118s -user 0m1.750s -sys 0m0.364s -Code generation completed in 2 seconds +real 0m6.775s +user 0m1.308s +sys 0m0.633s +Code generation completed in 7 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -217,9 +220,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -246,9 +249,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts b/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h index 7d7b960511..5e318bc0a4 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_heft.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc index fbb0c2effb..282f710a83 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@
-98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFS2_0( w_fp[3], w_fp[2], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 2.
* amp_sv[0]; @@ -404,8 +471,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -418,8 +488,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[2], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -431,8 +504,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[2], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -1076,7 +1152,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1106,11 +1181,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1185,38 +1261,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId!
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1395,11 +1440,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index 5208ed190c..19767c3f2d 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. 
This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,63 +49,15 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft -INFO: download model from http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz to the following directory: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models  ---2025-10-22 11:47:55-- http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz -Resolving madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)... 130.104.1.243 -Connecting to madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)|130.104.1.243|:80... connected. -HTTP request sent, awaiting response... 200 OK -Length: 50876 (50K) [application/x-gzip] -Saving to: ‘tmp.tgz’ - - 0K .......... .......... .......... .......... ......... 100% 921K=0.05s - -2025-10-22 11:47:55 (921 KB/s) - ‘tmp.tgz’ saved [50876/50876] - -heft/ -heft/write_param_card.py -heft/restrict_ckm.dat -heft/couplings.py -heft/HEFT_UFO.log -heft/lorentz.py -heft/__init__.py -heft/__pycache__/ -heft/particles.py -heft/object_library.py -heft/restrict_default.dat -heft/restrict_zeromass_ckm.dat -heft/restrict_no_b_mass.dat -heft/function_library.py -heft/parameters.py -heft/py3_model.pkl -heft/coupling_orders.py -heft/restrict_no_tau_mass.dat -heft/vertices.py -heft/restrict_no_masses.dat -heft/__pycache__/write_param_card.cpython-311.pyc -heft/__pycache__/parameters.cpython-311.pyc -heft/__pycache__/function_library.cpython-311.pyc -heft/__pycache__/coupling_orders.cpython-311.pyc -heft/__pycache__/object_library.cpython-311.pyc -heft/__pycache__/couplings.cpython-311.pyc -heft/__pycache__/particles.cpython-311.pyc -heft/__pycache__/vertices.cpython-311.pyc -heft/__pycache__/lorentz.cpython-311.pyc -heft/__pycache__/__init__.cpython-311.pyc -INFO: reload from .py file -INFO: load particles -INFO: load vertices -WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -DEBUG: model prefixing takes 0.004904985427856445  INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -170,13 +123,13 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.004 s +1 processes with 4 diagrams generated in 0.006 s Total: 1 processes with 4 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_bb Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -185,34 +138,34 @@ INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. 
-Generated helas calls for 1 subprocesses (4 diagrams) in 0.007 s +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. +Generated helas calls for 1 subprocesses (4 diagrams) in 0.010 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.185 s +ALOHA: aloha creates 4 routines in 0.127 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
quit -real 0m0.821s -user 0m0.568s -sys 0m0.084s +real 0m1.577s +user 0m0.495s +sys 0m0.136s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h index 7d7b960511..5e318bc0a4 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_heft.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc index 8fc4cf7184..7ae2e2ed53 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -1072,7 +1136,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1102,11 +1165,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1181,38 +1245,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1391,11 +1424,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt index b5ca9e6bb6..912b9ddf90 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. 
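All generated CPPProcess.cc files in this patch gain the same getChannelId helper shown above: it folds the previously inlined block into a single function that reads the channelId of one SIMD event page, asserts that every event in the page carries the same value (#898), optionally re-checks the second neppV page in mixed precision (#924), and returns 0 when allChannelIds is nullptr. The sanityCheckMixedPrecision flag lets the reweighting call site in sigmaKin (getChannelId( allChannelIds, ievt0, false )) skip that second-page check, presumably because calculate_jamps has already performed it for the same page pair. A standalone sketch of the C++ branch, using plain arrays instead of uint_sv and an assumed neppV = 4:

#include <cassert>
#include <cstdio>

constexpr int neppV = 4; // events per SIMD page (assumed value for this sketch)

// Host-side stand-in for getChannelId on one SIMD event page.
unsigned int getChannelIdSketch( const unsigned int* allChannelIds, int ievt00 )
{
  if( allChannelIds == nullptr ) return 0;                 // multichannel disabled
  const unsigned int* channelIds = allChannelIds + ievt00; // stand-in for ieventAccessRecordConst
  unsigned int channelId = channelIds[0];
  for( int i = 1; i < neppV; ++i )
    assert( channelId == channelIds[i] ); // all events in a page share one channelId (#898)
  assert( channelId > 0 );                // 1-based diagram index when multichannel is enabled
  return channelId;
}

int main()
{
  const unsigned int ids[2 * neppV] = { 3, 3, 3, 3, 7, 7, 7, 7 }; // two pages (hypothetical values)
  std::printf( "page 0 -> channel %u\n", getChannelIdSketch( ids, 0 ) );
  std::printf( "page 1 -> channel %u\n", getChannelIdSketch( ids, neppV ) );
  return 0;
}

Feeding a page whose entries disagree, e.g. { 3, 3, 5, 3 }, would trip the #898-style assertion; that invariant is exactly what lets the generated code apply one scalar channelId to a whole page.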
This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F import model sm-no_b_mass INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004863262176513672  +DEBUG: model prefixing takes 0.0017840862274169922  INFO: Restrict model sm-no_b_mass with file models/sm/restrict_no_b_mass.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -180,7 +181,7 @@ INFO: Process u~ d > t t~ w- added to mirror process d u~ > t t~ w- INFO: Process c~ s > t t~ w- added to mirror process s c~ > t t~ w- INFO: Process d~ u > t t~ w+ added to mirror process u d~ > t t~ w+ INFO: Process s~ c > t t~ w+ added to mirror process c s~ > t t~ w+ -4 processes with 8 diagrams generated in 0.093 s +4 processes with 8 diagrams generated in 0.081 s Total: 4 processes with 8 diagrams add process p p > t t~ w j @1 INFO: Checking for minimal orders which gives processes. @@ -222,7 +223,7 @@ INFO: Process d~ g > t t~ w+ u~ added to mirror process g d~ > t t~ w+ u~ INFO: Process d~ u > t t~ w+ g added to mirror process u d~ > t t~ w+ g INFO: Process s~ g > t t~ w+ c~ added to mirror process g s~ > t t~ w+ c~ INFO: Process s~ c > t t~ w+ g added to mirror process c s~ > t t~ w+ g -12 processes with 144 diagrams generated in 0.520 s +12 processes with 144 diagrams generated in 0.265 s Total: 16 processes with 152 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -233,10 +234,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --v INFO: initialize a new directory: CODEGEN_mad_nobm_pp_ttW INFO: remove old information in CODEGEN_mad_nobm_pp_ttW DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for 
process: g u > t t~ w+ d WEIGHTED<=5 @1 INFO: Processing color information for process: g u > t t~ w+ d @1 @@ -270,9 +271,9 @@ FileWriter t t~ w+ d WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxwpd -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gd_ttxwmu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -281,9 +282,9 @@ FileWriter t t~ w- u WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gd_ttxwmu -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gux_ttxwmdx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -292,9 +293,9 @@ FileWriter t t~ w- d~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxwmdx -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gdx_ttxwpux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -303,9 +304,9 @@ FileWriter t t~ w+ u~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gdx_ttxwpux -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_udx_ttxwpg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -314,9 +315,9 @@ FileWriter t t~ w+ g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group udx_ttxwpg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_dux_ttxwmg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -325,9 +326,9 @@ FileWriter t t~ w- g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group dux_ttxwmg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P0_udx_ttxwp DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -336,9 +337,9 @@ FileWriter t t~ w+ WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group udx_ttxwp -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1589]  INFO: Creating files in directory P0_dux_ttxwm DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -347,21 +348,21 @@ FileWriter t t~ w- WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group dux_ttxwm -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  -Generated helas calls for 8 subprocesses (76 diagrams) in 0.172 s -Wrote files for 212 helas calls in 0.856 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1589]  +Generated helas calls for 8 subprocesses (76 diagrams) in 0.101 s +Wrote files for 212 helas calls in 11.333 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 3 routines in 0.166 s +ALOHA: aloha creates 3 routines in 0.099 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 6 routines in 0.150 s +ALOHA: aloha creates 6 routines in 0.101 s FFV1 FFV1 FFV1 @@ -369,32 +370,34 @@ ALOHA: aloha creates 6 routines in 0.150 s FFV2 FFV2 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h -INFO: Created file HelAmps_sm_no_b_mass.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h +INFO: Created file HelAmps_sm_no_b_mass.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc INFO: Created files Parameters_sm_no_b_mass.h and Parameters_sm_no_b_mass.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. 
and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README Run "open index.html" to see more information about this process. quit -real 0m4.809s -user 0m4.082s -sys 0m0.695s -Code generation completed in 5 seconds +real 0m18.260s +user 0m2.773s +sys 0m1.309s +Code generation completed in 19 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -415,9 +418,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -444,9 +447,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts b/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h index 71a4c3f155..058c89b2f9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm_no_b_mass.h" 
+#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc index 0893180611..b04a53b56a 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,6 +100,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -358,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -372,7 +436,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -395,8 +459,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -409,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. 
in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1408,11 +1447,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc index 2a56cf5ec4..a9ff387906 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,6 +100,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -358,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -372,7 +436,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -395,8 +459,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -409,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. 
* amp_sv[0]; @@ -1089,7 +1159,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1119,11 +1188,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1198,38 +1268,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
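The hunks around here widen the numerators buffer from one slot per event to processConfig::ndiagrams slots per event (hence ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams )), so the final reweighting can pick out numerators_sv[channelId - 1]. A rough scalar sketch of that layout and of the two phases, assuming a trivially contiguous record layout (the real NUM_ACCESS classes use AOSOA layouts, so this is illustrative only):

#include <cstddef>
#include <vector>

using fptype = double;
constexpr int ndiagrams = 2; // stand-in for processConfig::ndiagrams

// The record for event ievt now starts at offset ievt * ndiagrams.
fptype* numeratorsRecord( std::vector<fptype>& allNumerators, std::size_t ievt )
{
  return allNumerators.data() + ievt * ndiagrams;
}

int main()
{
  std::size_t nevt = 8;
  std::vector<fptype> allNumerators( nevt * ndiagrams, 0. );
  std::vector<fptype> allDenominators( nevt, 0. );
  std::vector<fptype> allMEs( nevt, 1. );
  // Accumulation phase: every diagram's |amp|^2 goes to its own slot,
  // while all diagrams feed the common denominator.
  for( std::size_t ievt = 0; ievt < nevt; ++ievt )
  {
    fptype* numerators = numeratorsRecord( allNumerators, ievt );
    for( int idiag = 0; idiag < ndiagrams; ++idiag )
    {
      fptype amp2 = 1. + idiag; // placeholder for cxabs2( amp_sv[0] )
      numerators[idiag] += amp2;
      allDenominators[ievt] += amp2;
    }
  }
  // Reweighting phase: select the numerator of the sampled channel only.
  unsigned int channelId = 2; // 1-based, as in the patch
  for( std::size_t ievt = 0; ievt < nevt; ++ievt )
    allMEs[ievt] *= numeratorsRecord( allNumerators, ievt )[channelId - 1] / allDenominators[ievt];
  return 0;
}
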
#924
-      for( int i = 0; i < neppV; ++i )
-      {
-        assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-      }
-#endif
-    }
+    unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
     // Running sum of partial amplitudes squared for event by event color selection (#402)
     // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1408,11 +1447,12 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      MEs_sv *= numerators_sv / denominators_sv;
+      MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
     }
 #endif
     //for( int ieppV = 0; ieppV < neppV; ieppV++ )
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc
index 6e87d2186e..506791a8b1 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,6 +100,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB!
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -358,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -372,7 +436,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -398,8 +462,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -412,8 +479,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -428,8 +498,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -442,8 +515,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -457,8 +533,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. 
* amp_sv[0]; @@ -471,8 +550,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -485,8 +567,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -499,8 +584,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -513,8 +601,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -527,8 +618,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -541,8 +635,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -555,8 +652,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. 
* amp_sv[0]; @@ -1285,7 +1385,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1315,11 +1414,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1394,38 +1494,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
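The #924 comment (kept inside getChannelId, and removed here from the old inlined block) records that in "mixed" floating-point precision one float SIMD pass spans TWO double-precision neppV pages, which must therefore share a single channelId; getChannelId compares the second page against the first. A toy sketch of that two-page consistency check on a plain array (names and the neppV value are illustrative):

#include <cassert>

constexpr int neppV = 4;

// In mixed precision a "neppV2" super-page spans two consecutive neppV
// pages that must share one channelId (cf. #898 and #924): page 0 starts
// at ievt00, page 1 at ievt00 + neppV.
unsigned int superPageChannelId( const unsigned int* allChannelIds, int ievt00 )
{
  unsigned int channelId = allChannelIds[ievt00];
  for( int i = 0; i < 2 * neppV; ++i )
    assert( channelId == allChannelIds[ievt00 + i] ); // both pages must agree
  return channelId;
}

int main()
{
  unsigned int ids[8] = { 5, 5, 5, 5, 5, 5, 5, 5 };
  return superPageChannelId( ids, 0 ) == 5 ? 0 : 1;
}
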
#924
-      for( int i = 0; i < neppV; ++i )
-      {
-        assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-      }
-#endif
-    }
+    unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
     // Running sum of partial amplitudes squared for event by event color selection (#402)
     // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1604,11 +1673,12 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      MEs_sv *= numerators_sv / denominators_sv;
+      MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
     }
 #endif
     //for( int ieppV = 0; ieppV < neppV; ieppV++ )
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc
index 563e3c6ead..2bccd80866 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,6 +100,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB!
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -358,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -372,7 +436,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -398,8 +462,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. * amp_sv[0]; @@ -412,8 +479,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. * amp_sv[0]; @@ -428,8 +498,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -442,8 +515,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -457,8 +533,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. 
* amp_sv[0]; @@ -471,8 +550,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -485,8 +567,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -499,8 +584,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -513,8 +601,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. * amp_sv[0]; @@ -527,8 +618,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -541,8 +635,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -555,8 +652,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. 
* amp_sv[0]; @@ -1285,7 +1385,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1315,11 +1414,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1394,38 +1494,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
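Note the two call sites of getChannelId: calculate_jamps uses the default getChannelId( allChannelIds, ievt00 ), while the final reweighting loop calls getChannelId( allChannelIds, ievt0, false ), apparently because it walks one neppV page per iteration and so skips the mixed-precision cross-page comparison already performed earlier; presumably this is what the sanityCheckMixedPrecision default argument is for. A sketch of this default-on, opt-out flag pattern (illustrative names and sizes):

#include <cassert>

// Default-argument pattern: the extra cross-page check is on by default,
// but single-page callers can opt out by passing false.
unsigned int getId( const unsigned int* ids, int ievt0, bool checkSecondPage = true )
{
  constexpr int neppV = 4;
  unsigned int id = ids[ievt0];
  for( int i = 1; i < neppV; ++i ) assert( id == ids[ievt0 + i] ); // first page must be uniform
  if( checkSecondPage )
    for( int i = 0; i < neppV; ++i ) assert( id == ids[ievt0 + neppV + i] ); // second page must match
  return id;
}

int main()
{
  unsigned int ids[8] = { 7, 7, 7, 7, 7, 7, 7, 7 };
  unsigned int a = getId( ids, 0 );        // checks both pages
  unsigned int b = getId( ids, 0, false ); // single-page caller opts out
  return ( a == 7 && b == 7 ) ? 0 : 1;
}
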
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1604,11 +1673,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc index d6a510d40d..1550234c05 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,6 +100,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -358,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -372,7 +436,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -398,8 +462,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -412,8 +479,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -428,8 +498,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -442,8 +515,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -457,8 +533,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6. 
* amp_sv[0]; @@ -471,8 +550,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6. * amp_sv[0]; @@ -485,8 +567,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. * amp_sv[0]; @@ -499,8 +584,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. * amp_sv[0]; @@ -513,8 +601,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -527,8 +618,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -541,8 +635,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -555,8 +652,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. 
* amp_sv[0]; @@ -1285,7 +1385,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1315,11 +1414,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1394,38 +1494,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
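The diagram-by-diagram edits throughout these CPPProcess.cc diffs all follow one pattern: instead of adding |amp|^2 into a single shared numerator only when channelId matches the diagram number, every diagram now accumulates into its own slot whenever multichannel is on, and the channel selection moves to the final numerators_sv[channelId - 1] / denominators_sv division. A small numeric check that the two formulations give the same single-diagram weight (scalar doubles, made-up amplitude values):

#include <cassert>
#include <cmath>

int main()
{
  const double amp2[3] = { 0.5, 2.0, 1.5 }; // made-up |amp|^2 per diagram
  const unsigned int channelId = 2;         // sampled channel (1-based)

  // Old formulation: accumulate only the selected diagram's numerator.
  double numOld = 0., den = 0.;
  for( int d = 0; d < 3; ++d )
  {
    if( channelId == (unsigned int)( d + 1 ) ) numOld += amp2[d];
    den += amp2[d];
  }

  // New formulation: store every diagram, select at the end.
  double num[3] = { 0., 0., 0. };
  for( int d = 0; d < 3; ++d ) num[d] += amp2[d];
  double numNew = num[channelId - 1];

  assert( std::abs( numOld / den - numNew / den ) < 1e-15 ); // identical weight
  return 0;
}
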
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1604,11 +1673,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc index 75dc9427fe..e3f2d0c976 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,6 +100,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -358,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -372,7 +436,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -398,8 +462,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. * amp_sv[0]; @@ -412,8 +479,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. * amp_sv[0]; @@ -428,8 +498,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -442,8 +515,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -457,8 +533,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2.
* amp_sv[0]; @@ -471,8 +550,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -485,8 +567,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -499,8 +584,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -513,8 +601,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. * amp_sv[0]; @@ -527,8 +618,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -541,8 +635,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -555,8 +652,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2.
* amp_sv[0]; @@ -1285,7 +1385,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1315,11 +1414,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1394,38 +1494,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1604,11 +1673,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc index 27c3656e02..fd9d2b525b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,6 +100,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -358,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -372,7 +436,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -398,8 +462,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -412,8 +479,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -428,8 +498,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -442,8 +515,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -457,8 +533,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6.
* amp_sv[0]; @@ -471,8 +550,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6. * amp_sv[0]; @@ -485,8 +567,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. * amp_sv[0]; @@ -499,8 +584,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. * amp_sv[0]; @@ -513,8 +601,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -527,8 +618,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -541,8 +635,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -555,8 +652,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6.
* amp_sv[0]; @@ -1285,7 +1385,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1315,11 +1414,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1394,38 +1494,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1604,11 +1673,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc index 31f01d963a..61ace6e710 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,6 +100,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -358,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -372,7 +436,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -398,8 +462,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -412,8 +479,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -428,8 +498,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -442,8 +515,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -457,8 +533,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2.
* amp_sv[0]; @@ -471,8 +550,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -485,8 +567,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -499,8 +584,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -513,8 +601,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -527,8 +618,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -541,8 +635,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -555,8 +652,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6.
* amp_sv[0]; @@ -1285,7 +1385,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1315,11 +1414,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1394,38 +1494,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1604,11 +1673,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 0da34a0aa2..864a458f2c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0046498775482177734  +DEBUG: model prefixing takes 0.001878499984741211  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -167,7 +168,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.025 s +5 processes with 7 diagrams generated in 0.039 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. 
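The hunks above consolidate the per-function channelId extraction into a single getChannelId helper and make the numerators buffer per-diagram, so the multichannel weight becomes numerators_sv[channelId - 1] / denominators_sv instead of a single running numerator. A minimal standalone sketch of these two ideas, using plain scalars in place of the plugin's fptype_sv vectors and memory-access classes (neppV, ndiagrams and all numeric values below are assumptions for illustration, not the plugin's API):

// Sketch only: (1) a getChannelId that asserts all SIMD lanes of an event page
// share one channelId (cf. the #898 invariant), (2) weighting the matrix element
// by the numerator of the selected diagram alone, numerators[channelId - 1].
#include <cassert>
#include <cstdio>

constexpr int neppV = 4;     // events per SIMD page (assumption for this sketch)
constexpr int ndiagrams = 3; // stands in for processConfig::ndiagrams

// Return the scalar channelId shared by one SIMD page, or 0 if multichannel is disabled
unsigned int getChannelId( const unsigned int* allChannelIds, int ievt0 )
{
  if( allChannelIds == nullptr ) return 0; // multichannel disabled
  unsigned int channelId = allChannelIds[ievt0];
  for( int i = 1; i < neppV; ++i )
    assert( channelId == allChannelIds[ievt0 + i] ); // all lanes must agree
  assert( channelId > 0 );
  return channelId;
}

int main()
{
  const unsigned int channelIds[neppV] = { 2, 2, 2, 2 }; // one page, channel 2
  double numerators[ndiagrams] = { 0.1, 0.7, 0.2 }; // per-diagram |amp|^2 sums (made up)
  double denominator = 1.0;                         // sum over all diagrams (made up)
  double ME = 3.5;                                  // unweighted ME (made up)
  const unsigned int channelId = getChannelId( channelIds, 0 );
  if( channelId != 0 ) ME *= numerators[channelId - 1] / denominator;
  std::printf( "weighted ME = %f\n", ME ); // 3.5 * 0.7 = 2.45
  return 0;
}

In CUDA the channelId is already scalar per event, which is why the GPU branch of the real getChannelId needs no lane loop; only the C++ SIMD branch iterates over neppV (and, in mixed precision, optionally over a second neppV page).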
@@ -207,7 +208,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.114 s +13 processes with 76 diagrams generated in 0.082 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -373,7 +374,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.872 s +65 processes with 1119 diagrams generated in 0.814 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -384,10 +385,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vec INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Processing color information for process: g g > t t~ g g @2 @@ -498,9 +499,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 
110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1589]  INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
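The iconfig_to_diag / diag_to_iconfig dumps above always come in mutually inverse pairs: iconfig numbering is contiguous, while the diagram numbers may have gaps (e.g. diagram 34 is absent from the 35-entry maps), presumably for diagrams that carry no single-channel configuration. A small sketch of that round-trip invariant, with a made-up excerpt of the mapping (the real maps are built in model_handling.py; the inversion below is illustrative only):

// Sketch: diag_to_iconfig is the inverse of iconfig_to_diag, and diagrams
// missing from the mapping simply have no associated config.
#include <cassert>
#include <map>

int main()
{
  // excerpt in the style of the dumps above: identity with a gap at diagram 34
  const std::map<int, int> iconfig_to_diag = { { 1, 1 }, { 2, 2 }, { 33, 33 }, { 34, 35 }, { 35, 36 } };
  std::map<int, int> diag_to_iconfig;
  for( const auto& [iconfig, diag] : iconfig_to_diag )
    diag_to_iconfig[diag] = iconfig; // invert the map
  for( const auto& [iconfig, diag] : iconfig_to_diag )
    assert( diag_to_iconfig.at( diag ) == iconfig ); // round trip holds
  assert( diag_to_iconfig.count( 34 ) == 0 ); // diagram 34 has no config
  return 0;
}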
@@ -509,9 +510,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1589]  INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -520,9 +521,9 @@ FileWriter t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1589]  INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -531,9 +532,9 @@ FileWriter t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -542,9 +543,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -553,9 +554,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -564,9 +565,9 @@ FileWriter t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -575,9 +576,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -586,9 +587,9 @@ FileWriter t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -597,9 +598,9 @@ FileWriter t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -608,9 +609,9 @@ FileWriter t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1589]  INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -619,9 +620,9 @@ FileWriter t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -630,9 +631,9 @@ FileWriter t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -641,9 +642,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -652,9 +653,9 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -663,9 +664,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -674,9 +675,9 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1589]  INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -685,25 +686,25 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1577]  -Generated helas calls for 18 subprocesses (372 diagrams) in 1.392 s -Wrote files for 810 helas calls in 2.303 s +DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1589]  +Generated helas calls for 18 subprocesses (372 diagrams) in 0.593 s +Wrote files for 810 helas calls in 25.690 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.281 s +ALOHA: aloha creates 5 routines in 0.178 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.237 s +ALOHA: aloha creates 10 routines in 0.136 s VVV1 VVV1 FFV1 @@ -716,32 +717,34 @@ ALOHA: aloha creates 10 routines in 0.237 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m10.952s -user 0m9.707s -sys 0m1.156s -Code generation completed in 11 seconds +real 0m36.427s +user 0m5.481s +sys 0m2.328s +Code generation completed in 38 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -762,9 +765,9 @@ Code generation completed in 11 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -791,9 +794,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts +++ b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index 2ffa35504b..aa1147423f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@
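// [Editor's sketch, not part of the patch] The MemoryBuffers.h hunk above grows the
// numerators buffer from one slot per event to processConfig::ndiagrams slots per
// event, so that each diagram's squared amplitude can be accumulated separately and
// the single-diagram enhancement can later pick out numerators_sv[channelId - 1].
// A self-contained model of the indexing assumed by the new
// 'ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams )' calls
// (hypothetical names, plain scalar layout; the real buffers go through the
// plugin's AOSOA memory-access classes):
#include <cassert>
#include <vector>
using fptype = double;
constexpr int ndiagrams = 3; // e.g. P0_gg_ttx below has 3 diagrams
fptype* ieventAccessRecordSketch( std::vector<fptype>& allNumerators, int ievt )
{
  return allNumerators.data() + ievt * ndiagrams; // one contiguous slot per diagram
}
int main()
{
  const int nevt = 4;
  std::vector<fptype> allNumerators( nevt * ndiagrams, 0. );
  fptype* numerators = ieventAccessRecordSketch( allNumerators, 2 ); // event #2
  numerators[1] += 0.5; // accumulate |amp|^2 of diagram number 2 (index 1)
  assert( allNumerators[2 * ndiagrams + 1] == 0.5 );
  return 0;
}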
namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -405,8 +472,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -418,8 +488,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -1062,7 +1135,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1092,11 +1164,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1171,38 +1244,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1381,11 +1423,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index b1adf10a8d..ed592e4e1a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -360,7 +424,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -374,7 +438,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -394,8 +458,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -1039,7 +1106,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1069,11 +1135,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1148,38 +1215,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId!
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1358,11 +1394,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 7f29af7755..c2f3ee7141 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu 
#endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -394,8 +458,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; jamp_sv[2] += amp_sv[0]; @@ -410,8 +477,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -424,8 +494,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -439,8 +512,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -452,8 +528,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv +=
cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -466,8 +545,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -480,8 +562,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; @@ -493,8 +578,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -507,8 +595,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; @@ -520,8 +611,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -534,8 +628,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -548,8 +645,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -564,8 +664,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0(
w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -577,8 +680,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -590,8 +696,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -1279,7 +1388,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1309,11 +1417,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1388,38 +1497,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId!
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1598,11 +1676,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index a15b72b642..60b69e61d0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace 
mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
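The `getChannelId` helper introduced above centralises what used to be inlined at every call site: on the C++ side it reads lane 0 of the SIMD page and asserts (#898) that all other lanes agree before returning a single scalar channel. A standalone sketch of that uniformity check, with a plain array standing in for the plugin's uint_sv accessor machinery and a hypothetical neppV:

#include <cassert>

// Sketch of the SIMD-page uniformity check (#898): all neppV lanes of a page
// must carry one and the same channelId.
constexpr int neppV = 4;

unsigned int getChannelIdSketch( const unsigned int* channelIdsPage )
{
  unsigned int channelId = channelIdsPage[0]; // element[0]
  for( int i = 1; i < neppV; ++i )            // elements[1...neppV-1]
    assert( channelId == channelIdsPage[i] ); // every lane must agree
  assert( channelId > 0 );                    // 0 means "multichannel disabled", never stored here
  return channelId;
}

int main()
{
  const unsigned int page[neppV] = { 3, 3, 3, 3 };
  return getChannelIdSketch( page ) == 3 ? 0 : 1;
}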
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -360,7 +424,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -374,7 +438,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. * amp_sv[0]; @@ -412,8 +479,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -426,8 +496,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -440,8 +513,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. 
* amp_sv[0];
@@ -454,8 +530,11 @@ namespace mg5amcCpu
// Amplitude(s) for diagram number 5
VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 )
+ {
+ numerators_sv[4] += cxabs2( amp_sv[0] );
+ denominators_sv += cxabs2( amp_sv[0] );
+ }
#endif
jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1117,7 +1196,6 @@ namespace mg5amcCpu
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
- using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
#endif
#endif
@@ -1147,11 +1225,12 @@ namespace mg5amcCpu
fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
MEs_sv = fptype_sv{ 0 };
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
- numerators_sv = fptype_sv{ 0 };
+ for( int i = 0; i < processConfig::ndiagrams; ++i )
+ numerators_sv[i] = fptype_sv{ 0 };
denominators_sv = fptype_sv{ 0 };
#endif
}
@@ -1226,38 +1305,7 @@ namespace mg5amcCpu
const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
- // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
- // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId!
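With one slot per diagram, an event's numerator record now starts at `ievt0 * processConfig::ndiagrams` rather than `ievt0`, and the per-event reset loops over all slots instead of zeroing a single scalar. A sketch of the offset arithmetic under the simplest flat-layout assumption (the real buffers go through the AOSOA accessor classes, so this is illustrative only):

#include <cassert>
#include <vector>

// Sketch of the enlarged numerators layout: nevt events times ndiagrams slots,
// contiguous per event.
int main()
{
  constexpr int ndiagrams = 16; // stand-in for processConfig::ndiagrams
  constexpr int nevt = 8;
  std::vector<double> allNumerators( nevt * ndiagrams, -1. );
  for( int ievt0 = 0; ievt0 < nevt; ++ievt0 )
  {
    double* numerators = &allNumerators[ievt0 * ndiagrams];  // was &allNumerators[ievt0]
    for( int i = 0; i < ndiagrams; ++i ) numerators[i] = 0.; // reset every diagram slot
  }
  assert( allNumerators[3 * ndiagrams + 5] == 0. ); // event 3, diagram 5 was reset too
  return 0;
}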
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1436,11 +1484,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 2cac6d6802..d1a34b8ade 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace 
mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -360,7 +424,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -374,7 +438,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -412,8 +479,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -426,8 +496,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6. * amp_sv[0]; @@ -440,8 +513,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[4], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. 
* amp_sv[0]; @@ -454,8 +530,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -1117,7 +1196,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1147,11 +1225,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1226,38 +1305,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
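`kernelAccessP`, used above, differs from the scalar `kernelAccess` only in its return type: rather than a reference to one `fptype_sv`, it reinterprets the record pointer as an array of `fptype_sv`, so `numerators_sv[i]` addresses diagram i's slot. A sketch of the cast with `double` standing in for both fptype and the SIMD vector type (in the real code the cast also regroups neppV consecutive fptypes into one vector):

#include <cassert>

typedef double fptype;
typedef double fptype_sv;

// Sketch of the kernelAccessP idea: expose a flat fptype record as an
// indexable array of fptype_sv slots.
inline fptype_sv* kernelAccessP( fptype* buffer )
{
  return reinterpret_cast<fptype_sv*>( buffer ); // one fptype_sv per diagram slot
}

int main()
{
  fptype buffer[4] = { 0., 1., 2., 3. };
  fptype_sv* slots = kernelAccessP( buffer );
  assert( slots[2] == 2. ); // slot i maps onto record i of the flat buffer
  return 0;
}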
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1436,11 +1484,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 75c172df70..ec76c63604 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace 
mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -360,7 +424,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -374,7 +438,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -412,8 +479,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -426,8 +496,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -440,8 +513,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. 
* amp_sv[0]; @@ -454,8 +530,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[4], w_fp[7], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -1117,7 +1196,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1147,11 +1225,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1226,38 +1305,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
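A plausible reading of the `sanityCheckMixedPrecision` default argument, given the #924 comments above: in double/float "mixed" SIMD builds one call covers two neppV pages under a single channelId, so the second-page assertion is useful while amplitudes are being accumulated but redundant when `getChannelId` is re-invoked at reweighting time, which is why that later call passes `false`. A sketch of the two-page check over a hypothetical flat channelId array:

#include <cassert>

// Sketch of the mixed-precision sanity check (#924): when one call handles two
// neppV pages, the second page must carry the same channelId as the first.
constexpr int neppV = 4;

unsigned int getChannelIdSketch( const unsigned int* allChannelIds, int ievt00,
                                 bool sanityCheckMixedPrecision = true )
{
  const unsigned int channelId = allChannelIds[ievt00]; // first page, lane 0
  if( sanityCheckMixedPrecision )
  {
    for( int i = 0; i < neppV; ++i ) // second neppV page starts at ievt00 + neppV
      assert( channelId == allChannelIds[ievt00 + neppV + i] );
  }
  return channelId;
}

int main()
{
  const unsigned int ids[2 * neppV] = { 7, 7, 7, 7, 7, 7, 7, 7 };
  const unsigned int full = getChannelIdSketch( ids, 0 );        // full check while accumulating
  const unsigned int fast = getChannelIdSketch( ids, 0, false ); // skips the second-page scan
  return ( full == 7 && fast == 7 ) ? 0 : 1;
}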
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1436,11 +1484,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index 382d6f340c..c529cd5dd7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace 
mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
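Each subprocess translation unit now pulls in the generated processConfig.h (see the new include above), whose single job is to expose the diagram count as a compile-time constant from which the buffer sizes and loops are derived. A sketch of the generated header for a hypothetical 36-diagram subprocess (the real file is emitted by the code generator with the process id in the include guard), with an example consumer:

// Sketch of the generated processConfig.h (hypothetical 36-diagram subprocess).
#ifndef MG5_CONFIG_EXAMPLE_H
#define MG5_CONFIG_EXAMPLE_H 1

namespace processConfig
{
  constexpr int ndiagrams = 36; // number of Feynman diagrams in this subprocess
}

#endif // MG5_CONFIG_EXAMPLE_H

// Example consumer: buffer sizes follow the constant at compile time, e.g.
// constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
static_assert( processConfig::ndiagrams > 0, "a subprocess must have at least one diagram" );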
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -430,8 +494,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -450,8 +517,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -470,8 +540,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -491,8 +564,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -505,8 +581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; @@ -521,8 +600,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -535,8 +617,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -549,8 +634,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -565,8 +653,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -579,8 +670,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -593,8 +687,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -609,8 +706,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[12] += cxabs2( 
amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -623,8 +723,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -639,8 +742,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -655,8 +761,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[17] -= amp_sv[0]; @@ -673,8 +782,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; @@ -686,8 +798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; @@ -699,8 +814,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -714,8 +832,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -730,8 +851,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 
21 FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[20] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -744,8 +868,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -758,8 +885,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -774,8 +904,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -788,8 +921,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[24] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -802,8 +938,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -815,8 +954,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -828,8 +970,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 
) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -841,8 +986,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -854,8 +1002,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -868,8 +1019,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -910,8 +1064,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] -= amp_sv[0]; @@ -923,8 +1080,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 34 FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[33] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= amp_sv[0]; @@ -936,8 +1096,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -950,8 +1113,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif 
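Every converted hunk above adds the same `cxabs2( amp_sv[0] )` both to one numerator slot and to the shared denominator, so by construction the denominator should equal the sum of all slots (up to rounding); that makes a cheap debug-build invariant. A scalar sketch with hypothetical |amp|^2 values:

#include <cassert>
#include <cmath>

// Sketch of the invariant behind the split accumulation: the shared denominator
// equals the sum of the per-diagram numerator slots.
int main()
{
  constexpr int ndiagrams = 5;
  const double amp2[ndiagrams] = { 0.1, 2.5, 0.03, 1.2, 0.7 };
  double numerators[ndiagrams] = {};
  double denominator = 0.;
  for( int i = 0; i < ndiagrams; ++i )
  {
    numerators[i] += amp2[i]; // each diagram banks into its own slot...
    denominator += amp2[i];   // ...and into the shared sum
  }
  double sum = 0.;
  for( int i = 0; i < ndiagrams; ++i ) sum += numerators[i];
  assert( std::abs( sum - denominator ) < 1e-12 ); // denominator == sum of all slots
  return 0;
}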
     jamp_sv[9] += amp_sv[0];
     jamp_sv[15] -= amp_sv[0];
@@ -966,8 +1132,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 37
     FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[36] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -980,8 +1149,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 38
     FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[37] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -994,8 +1166,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 39
     VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[38] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[11] += amp_sv[0];
     jamp_sv[15] -= amp_sv[0];
@@ -1010,8 +1185,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 40
     FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[39] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1024,8 +1202,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 41
     FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[40] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1038,8 +1219,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 42
     FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[41] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[17] -= amp_sv[0];
@@ -1051,8 +1235,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 43
     FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[42] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[15] -= amp_sv[0];
@@ -1064,8 +1251,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 44
     FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[43] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[23] -= amp_sv[0];
@@ -1077,8 +1267,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 45
     FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[44] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[21] -= amp_sv[0];
@@ -1090,8 +1283,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 46
     FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[45] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1104,8 +1300,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 47
     VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[46] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] += amp_sv[0];
     jamp_sv[11] -= amp_sv[0];
@@ -1143,8 +1342,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 49
     FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[48] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1157,8 +1359,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 50
     FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[49] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
@@ -1173,8 +1378,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 51
     FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[50] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1187,8 +1395,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 52
     FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[51] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1201,8 +1412,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 53
     FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[52] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[3] += amp_sv[0];
     jamp_sv[13] -= amp_sv[0];
@@ -1217,8 +1431,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 54
     FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[53] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1231,8 +1448,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 55
     FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[54] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] += amp_sv[0];
     jamp_sv[3] -= amp_sv[0];
@@ -1247,8 +1467,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 56
     FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[55] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[10] += amp_sv[0];
     jamp_sv[11] -= amp_sv[0];
@@ -1263,8 +1486,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 57
     VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[56] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1317,8 +1543,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 59
     VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[58] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1337,8 +1566,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 60
     VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[59] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1357,8 +1589,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 61
     FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[60] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[19] += amp_sv[0];
     jamp_sv[20] -= amp_sv[0];
@@ -1373,8 +1608,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 62
     FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[61] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1387,8 +1625,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 63
     FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[62] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] += amp_sv[0];
     jamp_sv[6] -= amp_sv[0];
@@ -1403,8 +1644,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 64
     FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[63] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1418,8 +1662,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 65
     FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[64] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1432,8 +1679,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 66
     FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[65] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
@@ -1448,8 +1698,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 67
     FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[66] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1462,8 +1715,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 68
     FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[67] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1476,8 +1732,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 69
     FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[68] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[5] += amp_sv[0];
     jamp_sv[13] -= amp_sv[0];
@@ -1492,8 +1751,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 70
     FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[69] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1506,8 +1768,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 71
     FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[70] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] += amp_sv[0];
     jamp_sv[5] -= amp_sv[0];
@@ -1522,8 +1787,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 72
     FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[71] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] += amp_sv[0];
     jamp_sv[9] -= amp_sv[0];
@@ -1538,8 +1806,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 73
     VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[72] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1592,8 +1863,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 75
     VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[74] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1612,8 +1886,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 76
     VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[75] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1632,8 +1909,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 77
     FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[76] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[13] += amp_sv[0];
     jamp_sv[14] -= amp_sv[0];
@@ -1648,8 +1928,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 78
     FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[77] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1662,8 +1945,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 79
     FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[78] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] += amp_sv[0];
     jamp_sv[7] -= amp_sv[0];
@@ -1678,8 +1964,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 80
     FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[79] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1692,8 +1981,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 81
     FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[80] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] -= amp_sv[0];
@@ -1705,8 +1997,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 82
     FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[81] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[10] -= amp_sv[0];
@@ -1718,8 +2013,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 83
     FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[82] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] -= amp_sv[0];
@@ -1731,8 +2029,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 84
     FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[83] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] -= amp_sv[0];
@@ -1744,8 +2045,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 85
     FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[84] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1758,8 +2062,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 86
     FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[85] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += amp_sv[0];
     jamp_sv[7] -= amp_sv[0];
@@ -1774,8 +2081,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 87
     FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[86] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[16] -= amp_sv[0];
@@ -1787,8 +2097,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 88
     FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[87] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[13] -= amp_sv[0];
@@ -1800,8 +2113,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 89
     FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[88] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[22] -= amp_sv[0];
@@ -1813,8 +2129,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 90
     FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[89] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[19] -= amp_sv[0];
@@ -1826,8 +2145,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 91
     FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[90] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1840,8 +2162,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 92
     FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[91] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[3] += amp_sv[0];
     jamp_sv[5] -= amp_sv[0];
@@ -1890,8 +2215,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 94
     VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[93] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1910,8 +2238,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 95
     VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[94] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1930,8 +2261,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 96
     FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[95] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[18] += amp_sv[0];
     jamp_sv[19] -= amp_sv[0];
@@ -1946,8 +2280,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 97
     FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[96] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1960,8 +2297,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 98
     FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[97] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += amp_sv[0];
     jamp_sv[2] -= amp_sv[0];
@@ -1976,8 +2316,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 99
     FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[98] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2024,8 +2367,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 101
     VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[100] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2044,8 +2390,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 102
     VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[101] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2064,8 +2413,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 103
     FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[102] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[12] += amp_sv[0];
     jamp_sv[13] -= amp_sv[0];
@@ -2080,8 +2432,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 104
     FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[103] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2094,8 +2449,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 105
     FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[104] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] += amp_sv[0];
     jamp_sv[4] -= amp_sv[0];
@@ -2110,8 +2468,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 106
     FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[105] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2158,8 +2519,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 108
     VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[107] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2178,8 +2542,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 109
     VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[108] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2198,8 +2565,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 110
     FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[109] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[12] -= amp_sv[0];
@@ -2211,8 +2581,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 111
     FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[110] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[14] -= amp_sv[0];
@@ -2224,8 +2597,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 112
     FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[111] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[18] -= amp_sv[0];
@@ -2237,8 +2613,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 113
     FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[112] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[20] -= amp_sv[0];
@@ -3207,7 +3586,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
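Note on the hunks that follow: the numerator bookkeeping switches from a single per-event accumulator to one accumulator per diagram (processConfig::ndiagrams slots per event), so that the single-diagram enhancement factor can be formed at the very end as numerators_sv[channelId - 1] / denominators_sv instead of being filtered diagram by diagram. A minimal standalone sketch of this bookkeeping follows; all names in it (kNDiagrams, accumulate, multichannelWeight) are illustrative only, and scalar doubles stand in for the SIMD fptype_sv vectors of the real code.

#include <cassert>
#include <cstdio>

constexpr int kNDiagrams = 3; // plays the role of processConfig::ndiagrams

// Accumulate one diagram's |amp|^2 into its own numerator slot and into the
// common denominator (the real code does this only when channelId != 0).
void accumulate( double* numerators, double& denominator, int idiag, double amp2 )
{
  numerators[idiag] += amp2;
  denominator += amp2;
}

// Final single-diagram enhancement factor: pick the numerator of the diagram
// associated with the sampled channel (channelId is 1-based, as in the patch).
double multichannelWeight( const double* numerators, double denominator, unsigned int channelId )
{
  assert( channelId >= 1 && channelId <= kNDiagrams );
  return numerators[channelId - 1] / denominator;
}

int main()
{
  double numerators[kNDiagrams] = { 0., 0., 0. };
  double denominator = 0.;
  const double amp2PerDiagram[kNDiagrams] = { 1.0, 4.0, 5.0 }; // made-up |amp|^2 values
  for( int idiag = 0; idiag < kNDiagrams; ++idiag )
    accumulate( numerators, denominator, idiag, amp2PerDiagram[idiag] );
  // The ME would be rescaled by N_2 / (N_1+N_2+N_3) for channelId == 2
  std::printf( "weight for channel 2: %f\n", multichannelWeight( numerators, denominator, 2 ) ); // 0.4
  return 0;
}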
@@ -3237,11 +3615,12 @@ namespace mg5amcCpu
     fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
     MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-    numerators_sv = fptype_sv{ 0 };
+    for( int i = 0; i < processConfig::ndiagrams; ++i )
+      numerators_sv[i] = fptype_sv{ 0 };
     denominators_sv = fptype_sv{ 0 };
 #endif
   }
@@ -3316,38 +3695,7 @@ namespace mg5amcCpu
     const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-    // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-    if( allChannelIds != nullptr )
-    {
-      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-      // NB: channelIds_sv is a scalar in no-SIMD C++
-      channelId = channelIds_sv;
-#else
-      // NB: channelIds_sv is a vector in SIMD C++
-      channelId = channelIds_sv[0]; // element[0]
-      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-      {
-        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-      }
-#endif
-      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-      const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-      uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      for( int i = 0; i < neppV; ++i )
-      {
-        assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-      }
-#endif
-    }
+    unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
     // Running sum of partial amplitudes squared for event by event color selection (#402)
     // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -3526,11 +3874,12 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      MEs_sv *= numerators_sv / denominators_sv;
+      MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
     }
 #endif
     //for( int ieppV = 0; ieppV < neppV; ieppV++ )
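The CPPProcess.cc diff below adds the same getChannelId helper that the template's sigmaKin now calls: it returns 0 when multichannel is disabled (null buffer), and otherwise extracts the one channelId shared by all events of a SIMD page, asserting uniformity (#898) and, unless sanityCheckMixedPrecision is false, also checking the second neppV page used in mixed precision (#924). A simplified sketch of that uniformity check follows, with hypothetical names (kNeppV, pageChannelId) and a plain array standing in for the memory-access classes.

#include <cassert>

constexpr int kNeppV = 4; // events per SIMD page (neppV in the real code)

// Return the channel shared by all events of one SIMD page, or 0 if multichannel
// is disabled (null buffer); assert that the page is uniform (cf. check #898).
unsigned int pageChannelId( const unsigned int* allChannelIds, int ievt0 )
{
  if( allChannelIds == nullptr ) return 0; // no single-diagram enhancement
  const unsigned int channelId = allChannelIds[ievt0]; // element[0] of the page
  for( int i = 1; i < kNeppV; ++i )
    assert( channelId == allChannelIds[ievt0 + i] ); // all events must agree
  assert( channelId > 0 ); // valid channels are 1-based
  return channelId;
}

int main()
{
  const unsigned int ids[2 * kNeppV] = { 3, 3, 3, 3, 7, 7, 7, 7 };
  return ( pageChannelId( ids, 0 ) == 3 && pageChannelId( ids, kNeppV ) == 7 ) ? 0 : 1;
}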
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -360,7 +424,7 @@ namespace mg5amcCpu
     for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
      COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -374,7 +438,7 @@ namespace mg5amcCpu
     unsigned int channelId = gpu_channelId( allChannelIds );
 #endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
@@ -400,8 +464,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 1
     FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[0] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
@@ -416,8 +483,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 2
     FFV1_0( w_fp[8], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[1] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
@@ -432,8 +502,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 3
     VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[2] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] += 1. / 2. * amp_sv[0];
     jamp_sv[2] -= 1. / 2. * amp_sv[0];
@@ -448,8 +521,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 4
     FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[3] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -464,8 +540,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 5
     FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[4] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -481,8 +560,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 6
     FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[5] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= 1. / 6. * amp_sv[0];
     jamp_sv[5] += 1. / 2. * amp_sv[0];
@@ -496,8 +578,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 7
     FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[6] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] -= 1. / 6. * amp_sv[0];
     jamp_sv[5] += 1. / 2. * amp_sv[0];
@@ -510,8 +595,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 8
     FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[7] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] += 1. / 2. * amp_sv[0];
     jamp_sv[4] -= 1. / 6. * amp_sv[0];
@@ -524,8 +612,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 9
     FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[8] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= 1. / 6. * amp_sv[0];
     jamp_sv[1] += 1. / 2. * amp_sv[0];
@@ -538,8 +629,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 10
     VVV1_0( w_fp[1], w_fp[8], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[9] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -553,8 +647,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 11
     FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[10] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += 1. / 2. * amp_sv[0];
     jamp_sv[8] -= 1. / 6. * amp_sv[0];
@@ -567,8 +664,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 12
     FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[11] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] -= 1. / 6. * amp_sv[0];
     jamp_sv[10] += 1. / 2. * amp_sv[0];
@@ -581,8 +681,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 13
     FFV1_0( w_fp[12], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[12] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] -= 1. / 6. * amp_sv[0];
     jamp_sv[6] += 1. / 2. * amp_sv[0];
@@ -595,8 +698,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 14
     FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[13] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] -= 1. / 6. * amp_sv[0];
     jamp_sv[10] += 1. / 2. * amp_sv[0];
@@ -609,8 +715,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 15
     VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[14] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -624,8 +733,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 16
     FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[15] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += 1. / 2. * amp_sv[0];
     jamp_sv[7] -= 1. / 6. * amp_sv[0];
@@ -638,8 +750,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 17
     FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[16] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] += 1. / 2. * amp_sv[0];
     jamp_sv[7] -= 1. / 6. * amp_sv[0];
@@ -652,8 +767,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 18
     FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[17] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[3] -= 1. / 6. * amp_sv[0];
     jamp_sv[6] += 1. / 2. * amp_sv[0];
@@ -666,8 +784,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 19
     FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[18] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] += 1. / 2. * amp_sv[0];
     jamp_sv[3] -= 1. / 6. * amp_sv[0];
@@ -680,8 +801,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 20
     VVV1_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[19] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -695,8 +819,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 21
     FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[20] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] -= 1. / 6. * amp_sv[0];
     jamp_sv[9] += 1. / 2. * amp_sv[0];
@@ -709,8 +836,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 22
     FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[21] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[5] += 1. / 2. * amp_sv[0];
     jamp_sv[7] -= 1. / 6. * amp_sv[0];
@@ -723,8 +853,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 23
     FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[22] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[5] += 1. / 2. * amp_sv[0];
     jamp_sv[11] -= 1. / 6. * amp_sv[0];
@@ -737,8 +870,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 24
     FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[23] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] += 1. / 2. * amp_sv[0];
     jamp_sv[11] -= 1. / 6. * amp_sv[0];
@@ -751,8 +887,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 25
     VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[24] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -765,8 +904,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 26
     FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 )
+    {
+      numerators_sv[25] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] -= 1. / 6. * amp_sv[0];
     jamp_sv[9] += 1. / 2. * amp_sv[0];
* amp_sv[0]; @@ -779,8 +921,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -793,8 +938,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * amp_sv[0]; jamp_sv[2] += 1. / 2. * amp_sv[0]; @@ -807,8 +955,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -821,8 +972,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += 1. / 2. * amp_sv[0]; jamp_sv[11] -= 1. / 6. * amp_sv[0]; @@ -835,8 +989,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -849,8 +1006,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[31] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[3] -= 1. / 6. * amp_sv[0]; @@ -863,8 +1023,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[12], w_fp[4], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -899,8 +1062,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 VVV1_0( w_fp[1], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -915,8 +1081,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -1614,7 +1783,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1644,11 +1812,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1723,38 +1892,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1933,11 +2071,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index 1fdfcee1ed..4b9f028e6d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace 
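Note on the refactoring pattern in the hunks above: with multichannel enabled, every diagram N now accumulates its |amp|^2 into its own slot numerators_sv[N-1] (the numerator buffer holds processConfig::ndiagrams entries per event), instead of a single scalar gated on channelId == N, and the selected channel's slot is divided by the common denominator at the end. A minimal standalone C++ sketch of this bookkeeping follows; ndiagrams, amp2 and the sample values are illustrative assumptions, not the generated code:

#include <cassert>
#include <cstdio>

constexpr int ndiagrams = 3; // stand-in for processConfig::ndiagrams

int main()
{
  const unsigned int channelId = 2;      // selected channel (1-based; 0 disables the enhancement)
  double numerators[ndiagrams] = { 0. }; // one slot per diagram (previously a single scalar)
  double denominator = 0.;
  const double amp2[ndiagrams] = { 0.10, 0.25, 0.65 }; // stand-ins for cxabs2( amp_sv[0] )
  for( int idiag = 0; idiag < ndiagrams; ++idiag )
    if( channelId != 0 ) // accumulate for every diagram, not only the selected one
    {
      numerators[idiag] += amp2[idiag];
      denominator += amp2[idiag];
    }
  double me = 1.; // stand-in for MEs_sv
  if( channelId != 0 )
  {
    assert( channelId <= static_cast<unsigned int>( ndiagrams ) );
    me *= numerators[channelId - 1] / denominator; // was 'MEs_sv *= numerators_sv / denominators_sv'
  }
  printf( "single-diagram enhancement factor = %f\n", me );
  return 0;
}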
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc
index 1fdfcee1ed..4b9f028e6d 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -101,6 +102,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
   constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -360,7 +424,7 @@ namespace mg5amcCpu
     for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
       COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -374,7 +438,7 @@ namespace mg5amcCpu
     unsigned int channelId = gpu_channelId( allChannelIds );
 #endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
@@ -400,8 +464,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1
       FFV1_0( w_fp[1], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[0] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -416,8 +483,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 2
       FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[1] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
@@ -432,8 +502,11 @@ namespace mg5amcCpu
      // Amplitude(s) for diagram number 3
       VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[2] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= 1. / 2. * amp_sv[0];
       jamp_sv[5] += 1. / 2. * amp_sv[0];
@@ -448,8 +521,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 4
       FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[3] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
@@ -464,8 +540,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 5
       FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[4] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -481,8 +560,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 6
       FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[5] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[3] += 1. / 6. * amp_sv[0];
@@ -496,8 +578,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 7
       FFV1_0( w_fp[1], w_fp[10], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[6] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[1] += 1. / 6. * amp_sv[0];
@@ -510,8 +595,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 8
       FFV1_0( w_fp[12], w_fp[5], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[7] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 6. * amp_sv[0];
       jamp_sv[2] -= 1. / 2. * amp_sv[0];
@@ -524,8 +612,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 9
       FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[8] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= 1. / 2. * amp_sv[0];
       jamp_sv[3] += 1. / 6. * amp_sv[0];
@@ -538,8 +629,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 10
       VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[9] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -553,8 +647,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 11
       FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[10] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] -= 1. / 2. * amp_sv[0];
       jamp_sv[11] += 1. / 6. * amp_sv[0];
@@ -567,8 +664,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 12
       FFV1_0( w_fp[1], w_fp[10], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[11] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 6. * amp_sv[0];
       jamp_sv[8] -= 1. / 2. * amp_sv[0];
@@ -581,8 +681,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 13
       FFV1_0( w_fp[12], w_fp[5], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[12] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 6. * amp_sv[0];
       jamp_sv[7] -= 1. / 2. * amp_sv[0];
@@ -595,8 +698,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 14
       FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[13] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= 1. / 2. * amp_sv[0];
       jamp_sv[11] += 1. / 6. * amp_sv[0];
@@ -609,8 +715,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 15
       VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[14] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -624,8 +733,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 16
       FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[15] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += 1. / 6. * amp_sv[0];
       jamp_sv[7] -= 1. / 2. * amp_sv[0];
@@ -638,8 +750,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 17
       FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[16] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] -= 1. / 2. * amp_sv[0];
       jamp_sv[6] += 1. / 6. * amp_sv[0];
@@ -652,8 +767,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 18
       FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[17] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += 1. / 6. * amp_sv[0];
       jamp_sv[7] -= 1. / 2. * amp_sv[0];
@@ -666,8 +784,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 19
       FFV1_0( w_fp[1], w_fp[11], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[18] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += 1. / 6. * amp_sv[0];
       jamp_sv[5] -= 1. / 2. * amp_sv[0];
@@ -680,8 +801,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 20
       VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[19] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -695,8 +819,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 21
       FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[20] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += 1. / 6. * amp_sv[0];
       jamp_sv[10] -= 1. / 2. * amp_sv[0];
@@ -709,8 +836,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 22
       FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[21] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[6] += 1. / 6. * amp_sv[0];
@@ -723,8 +853,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 23
       FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[22] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[9] += 1. / 6. * amp_sv[0];
@@ -737,8 +870,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 24
       FFV1_0( w_fp[9], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[23] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += 1. / 6. * amp_sv[0];
       jamp_sv[10] -= 1. / 2. * amp_sv[0];
@@ -751,8 +887,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 25
       VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[24] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -765,8 +904,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 26
       FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[25] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] -= 1. / 2. * amp_sv[0];
       jamp_sv[11] += 1. / 6. * amp_sv[0];
@@ -779,8 +921,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 27
       FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[26] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -793,8 +938,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 28
       FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[27] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += 1. / 6. * amp_sv[0];
       jamp_sv[5] -= 1. / 2. * amp_sv[0];
@@ -807,8 +955,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 29
       FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[28] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -821,8 +972,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 30
       FFV1_0( w_fp[1], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[29] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= 1. / 2. * amp_sv[0];
       jamp_sv[9] += 1. / 6. * amp_sv[0];
@@ -835,8 +989,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 31
       FFV1_0( w_fp[1], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[30] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -849,8 +1006,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 32
       FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[31] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= 1. / 2. * amp_sv[0];
       jamp_sv[4] += 1. / 6. * amp_sv[0];
@@ -863,8 +1023,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 33
       FFV1_0( w_fp[12], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[32] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -899,8 +1062,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 35
       VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[34] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 2. * amp_sv[0];
       jamp_sv[2] -= 1. / 2. * amp_sv[0];
@@ -915,8 +1081,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 36
       VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[35] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 2. * amp_sv[0];
       jamp_sv[5] -= 1. / 2. * amp_sv[0];
@@ -1614,7 +1783,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1644,11 +1812,12 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+        numerators_sv[i] = fptype_sv{ 0 };
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1723,38 +1892,7 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
+      unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1933,11 +2071,12 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
      {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
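Note on getChannelId above: on the C++/SIMD side the helper reads one SIMD page of channelIds and enforces that all neppV events in the page (and, in mixed precision, in the second page too) carry the same value, which is then used as a single scalar for the whole page. A standalone sketch of that uniformity check follows; neppV and the flat array layout are assumptions (the real code goes through the HostAccessChannelIds/DeviceAccessChannelIds accessors):

#include <cassert>
#include <cstdio>

constexpr int neppV = 4; // stand-in for the SIMD event page size

unsigned int getChannelIdSketch( const unsigned int* allChannelIds, int ievt00 )
{
  if( allChannelIds == nullptr ) return 0; // 0 disables the multichannel enhancement
  const unsigned int* page = allChannelIds + ievt00; // one SIMD page of channelIds
  const unsigned int channelId = page[0];
  for( int i = 1; i < neppV; ++i )
    assert( channelId == page[i] ); // all events in a page must share one channelId (cf. #898)
  assert( channelId > 0 ); // enabled multichannel implies a 1-based channelId
  return channelId;
}

int main()
{
  const unsigned int ids[2 * neppV] = { 3, 3, 3, 3, 7, 7, 7, 7 };
  printf( "page 0 -> channel %u\n", getChannelIdSketch( ids, 0 ) );
  printf( "page 1 -> channel %u\n", getChannelIdSketch( ids, neppV ) );
  return 0;
}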
"MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -360,7 +424,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -374,7 +438,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +464,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -416,8 +483,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -432,8 +502,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * amp_sv[0]; jamp_sv[7] += 1. / 2. 
* amp_sv[0]; @@ -448,8 +521,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -464,8 +540,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -481,8 +560,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * amp_sv[0]; jamp_sv[6] -= 1. / 6. * amp_sv[0]; @@ -496,8 +578,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -510,8 +595,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[7] += 1. / 2. * amp_sv[0]; @@ -524,8 +612,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= 1. / 6. * amp_sv[0]; jamp_sv[7] += 1. / 2. * amp_sv[0]; @@ -538,8 +629,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -553,8 +647,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[9] -= 1. / 6. * amp_sv[0]; @@ -567,8 +664,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[11] += 1. / 2. * amp_sv[0]; @@ -581,8 +681,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6. * amp_sv[0]; @@ -595,8 +698,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= 1. / 6. * amp_sv[0]; jamp_sv[11] += 1. / 2. * amp_sv[0]; @@ -609,8 +715,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -624,8 +733,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. * amp_sv[0]; @@ -638,8 +750,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[4] += 1. / 2. 
* amp_sv[0]; @@ -652,8 +767,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 6. * amp_sv[0]; @@ -666,8 +784,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 6. * amp_sv[0]; @@ -680,8 +801,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -695,8 +819,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[20] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[8] += 1. / 2. * amp_sv[0]; @@ -709,8 +836,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -723,8 +853,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * amp_sv[0]; jamp_sv[10] -= 1. / 6. * amp_sv[0]; @@ -737,8 +870,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += 1. / 2. * amp_sv[0]; jamp_sv[10] -= 1. / 6. 
* amp_sv[0]; @@ -751,8 +887,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[24] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -765,8 +904,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += 1. / 2. * amp_sv[0]; jamp_sv[9] -= 1. / 6. * amp_sv[0]; @@ -779,8 +921,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -793,8 +938,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 2. * amp_sv[0]; jamp_sv[6] -= 1. / 6. * amp_sv[0]; @@ -807,8 +955,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -821,8 +972,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= 1. / 6. * amp_sv[0]; jamp_sv[11] += 1. / 2. * amp_sv[0]; @@ -835,8 +989,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. 
@@ -849,8 +1006,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 32
       FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[31] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] -= 1. / 6. * amp_sv[0];
       jamp_sv[7] += 1. / 2. * amp_sv[0];
@@ -863,8 +1023,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 33
       FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[32] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -899,8 +1062,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 35
       VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[34] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -915,8 +1081,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 36
       VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[35] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -1614,7 +1783,6 @@
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
   using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-  using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
 
@@ -1644,11 +1812,12 @@
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+        numerators_sv[i] = fptype_sv{ 0 };
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1723,38 +1892,7 @@
     const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-    // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-    if( allChannelIds != nullptr )
-    {
-      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-      // NB: channelIds_sv is a scalar in no-SIMD C++
-      channelId = channelIds_sv;
-#else
-      // NB: channelIds_sv is a vector in SIMD C++
-      channelId = channelIds_sv[0]; // element[0]
-      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-      {
-        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-      }
-#endif
-      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-      const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-      uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      for( int i = 0; i < neppV; ++i )
-      {
-        assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-      }
-#endif
-    }
+    unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
     // Running sum of partial amplitudes squared for event by event color selection (#402)
     // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1933,11 +2071,12 @@
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
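The hunks above replace the single per-event numerator with one numerator per diagram per event, and the final reweighting then picks out the entry of the selected channel instead of the old single running value. A minimal scalar sketch of the new bookkeeping, assuming illustrative nevt and ndiagrams values and plain array indexing in place of the NUM_ACCESS/DEN_ACCESS classes (not the plugin's actual SIMD-aware accessors):

#include <cassert>
#include <vector>

typedef double fptype; // scalar stand-in for fptype_sv

int main()
{
  const int nevt = 4;      // hypothetical number of events
  const int ndiagrams = 3; // stand-in for processConfig::ndiagrams
  // one numerator per diagram per event (was: one per event)
  std::vector<fptype> numerators( nevt * ndiagrams, 0. );
  std::vector<fptype> denominators( nevt, 0. );
  std::vector<fptype> MEs( nevt, 1. );
  const unsigned int channelId = 2; // 1-based channel number, as in the diff
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    for( int idiag = 0; idiag < ndiagrams; idiag++ )
    {
      const fptype amp2 = 1.; // stand-in for cxabs2( amp_sv[0] )
      numerators[ievt * ndiagrams + idiag] += amp2; // per-diagram numerator
      denominators[ievt] += amp2;                   // running sum over all diagrams
    }
    // single-diagram enhancement: 1-based channelId selects the 0-based entry channelId - 1
    MEs[ievt] *= numerators[ievt * ndiagrams + ( channelId - 1 )] / denominators[ievt];
  }
  assert( MEs[0] > 0. );
  return 0;
}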
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc
index c691d758e7..772392b151 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -103,6 +104,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
   constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -362,7 +426,7 @@
       for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
         COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -376,7 +440,7 @@
       unsigned int channelId = gpu_channelId( allChannelIds );
 #endif
       // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
 
@@ -402,8 +466,11 @@
       // Amplitude(s) for diagram number 1
       FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[0] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 4. * amp_sv[0];
       jamp_sv[1] -= 1. / 12. * amp_sv[0];
@@ -418,8 +485,11 @@
       // Amplitude(s) for diagram number 2
       FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[1] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 12. * amp_sv[0];
       jamp_sv[2] -= 1. / 12. * amp_sv[0];
@@ -434,8 +504,11 @@
       // Amplitude(s) for diagram number 3
       VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[2] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
@@ -448,8 +521,11 @@
       // Amplitude(s) for diagram number 4
       FFV1_0( w_fp[3], w_fp[5], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[3] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 4. * amp_sv[0];
       jamp_sv[2] -= 1. / 12. * amp_sv[0];
@@ -464,8 +540,11 @@
       // Amplitude(s) for diagram number 5
       FFV1_0( w_fp[1], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[4] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= 1. / 12. * amp_sv[0];
       jamp_sv[3] += 1. / 4. * amp_sv[0];
@@ -480,8 +559,11 @@
       // Amplitude(s) for diagram number 6
       FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[5] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 12. * amp_sv[0];
       jamp_sv[3] += 1. / 4. * amp_sv[0];
@@ -496,8 +578,11 @@
       // Amplitude(s) for diagram number 7
       FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[6] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 4. * amp_sv[0];
       jamp_sv[1] -= 1. / 12. * amp_sv[0];
@@ -1195,7 +1280,6 @@
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
   using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-  using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
 
@@ -1225,11 +1309,12 @@
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+        numerators_sv[i] = fptype_sv{ 0 };
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1304,38 +1389,7 @@
     const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-    // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-    if( allChannelIds != nullptr )
-    {
-      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-      // NB: channelIds_sv is a scalar in no-SIMD C++
-      channelId = channelIds_sv;
-#else
-      // NB: channelIds_sv is a vector in SIMD C++
-      channelId = channelIds_sv[0]; // element[0]
-      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-      {
-        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-      }
-#endif
-      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-      const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-      uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      for( int i = 0; i < neppV; ++i )
-      {
-        assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-      }
-#endif
-    }
+    unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
     // Running sum of partial amplitudes squared for event by event color selection (#402)
     // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1514,11 +1568,12 @@
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
      {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
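The getChannelId helper introduced in these files concentrates the #898 sanity checks in one place: on the C++ side it reads one SIMD page of channel ids and asserts that every lane agrees before returning the scalar value. A reduced sketch of that uniformity check, with a plain array standing in for uint_sv and an illustrative neppV (both are assumptions for the example, not values from the patch):

#include <cassert>

int main()
{
  const int neppV = 4;                                      // illustrative SIMD width
  const unsigned int channelIds_v[neppV] = { 3, 3, 3, 3 };  // one page of channel ids
  unsigned int channelId = channelIds_v[0];                 // element[0]
  for( int i = 1; i < neppV; ++i )                          // elements[1...neppV-1]
    assert( channelId == channelIds_v[i] );                 // all lanes must agree (as in #898)
  assert( channelId > 0 );                                  // 0 would mean "multichannel disabled"
  return 0;
}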
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
index ca438c57e9..111d7bd91e 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -109,6 +110,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
   constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -368,7 +432,7 @@
       for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
         COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -382,7 +446,7 @@
       unsigned int channelId = gpu_channelId( allChannelIds );
 #endif
       // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
 
@@ -408,8 +472,11 @@
       // Amplitude(s) for diagram number 1
       FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[0] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= 1. / 4. * amp_sv[0];
       jamp_sv[3] += 1. / 12. * amp_sv[0];
@@ -424,8 +491,11 @@
       // Amplitude(s) for diagram number 2
       FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[1] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 4. * amp_sv[0];
       jamp_sv[3] += 1. / 12. * amp_sv[0];
@@ -440,8 +510,11 @@
       // Amplitude(s) for diagram number 3
       VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[2] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
@@ -454,8 +527,11 @@
       // Amplitude(s) for diagram number 4
       FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[3] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[2] -= 1. / 4. * amp_sv[0];
@@ -470,8 +546,11 @@
       // Amplitude(s) for diagram number 5
       FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[4] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[1] -= 1. / 4. * amp_sv[0];
@@ -486,8 +565,11 @@
       // Amplitude(s) for diagram number 6
       FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[5] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[1] -= 1. / 4. * amp_sv[0];
@@ -502,8 +584,11 @@
       // Amplitude(s) for diagram number 7
       FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[6] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[2] -= 1. / 4. * amp_sv[0];
@@ -1201,7 +1286,6 @@
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
   using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-  using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
 
@@ -1231,11 +1315,12 @@
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+        numerators_sv[i] = fptype_sv{ 0 };
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1310,38 +1395,7 @@
     const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-    // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-    if( allChannelIds != nullptr )
-    {
-      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-      // NB: channelIds_sv is a scalar in no-SIMD C++
-      channelId = channelIds_sv;
-#else
-      // NB: channelIds_sv is a vector in SIMD C++
-      channelId = channelIds_sv[0]; // element[0]
-      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-      {
-        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-      }
-#endif
-      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-      const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-      uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      for( int i = 0; i < neppV; ++i )
-      {
-        assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-      }
-#endif
-    }
+    unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
     // Running sum of partial amplitudes squared for event by event color selection (#402)
     // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1520,11 +1574,12 @@
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
      //for( int ieppV = 0; ieppV < neppV; ieppV++ )
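The recurring kernelAccess-to-kernelAccessP switch in these files changes the view of the numerators buffer from a single SIMD value to an array of SIMD values, one per diagram, so that numerators_sv[i] can address the i-th diagram. A simplified sketch of the two access styles, under the assumption of a scalar build where fptype_sv is just fptype (the real accessors are the templated ones in the MemoryAccess*.h headers):

typedef double fptype;
typedef double fptype_sv; // scalar build: fptype_sv == fptype

// single-value view: the buffer holds exactly one (SIMD) value
inline fptype_sv& kernelAccess( fptype* buffer )
{
  return *reinterpret_cast<fptype_sv*>( buffer );
}

// array view: the buffer holds one (SIMD) value per diagram
inline fptype_sv* kernelAccessP( fptype* buffer )
{
  return reinterpret_cast<fptype_sv*>( buffer );
}

int main()
{
  fptype buffer[3] = { 0., 0., 0. }; // e.g. a process with ndiagrams == 3
  fptype_sv* numerators_sv = kernelAccessP( buffer );
  for( int i = 0; i < 3; ++i ) numerators_sv[i] = 1.; // initialise one value per diagram
  fptype_sv& first = kernelAccess( buffer );          // old single-value style
  return first > 0. ? 0 : 1;
}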
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc
index 7603295c95..1f57233a6f 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -101,6 +102,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
   constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -360,7 +424,7 @@
       for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
         COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -374,7 +438,7 @@
       unsigned int channelId = gpu_channelId( allChannelIds );
 #endif
       // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
 
@@ -400,8 +464,11 @@
       // Amplitude(s) for diagram number 1
       FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[0] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 4. * amp_sv[0];
       jamp_sv[1] -= 1. / 12. * amp_sv[0];
@@ -416,8 +483,11 @@
       // Amplitude(s) for diagram number 2
       FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[1] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 12. * amp_sv[0];
       jamp_sv[2] -= 1. / 12. * amp_sv[0];
@@ -432,8 +502,11 @@
       // Amplitude(s) for diagram number 3
       VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[2] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
@@ -446,8 +519,11 @@
       // Amplitude(s) for diagram number 4
       FFV1_0( w_fp[9], w_fp[5], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[3] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 4. * amp_sv[0];
       jamp_sv[2] -= 1. / 12. * amp_sv[0];
@@ -462,8 +538,11 @@
       // Amplitude(s) for diagram number 5
       FFV1_0( w_fp[1], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[4] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= 1. / 12. * amp_sv[0];
       jamp_sv[3] += 1. / 4. * amp_sv[0];
@@ -480,8 +559,11 @@
       // Amplitude(s) for diagram number 6
       FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[5] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[1] -= 1. / 4. * amp_sv[0];
@@ -496,8 +578,11 @@
       // Amplitude(s) for diagram number 7
       FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[6] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[2] -= 1. / 4. * amp_sv[0];
@@ -512,8 +597,11 @@
       // Amplitude(s) for diagram number 8
       VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[7] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
@@ -526,8 +614,11 @@
       // Amplitude(s) for diagram number 9
       FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[8] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 4. * amp_sv[0];
       jamp_sv[3] += 1. / 12. * amp_sv[0];
@@ -542,8 +633,11 @@
       // Amplitude(s) for diagram number 10
       FFV1_0( w_fp[1], w_fp[10], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[9] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= 1. / 4. * amp_sv[0];
       jamp_sv[3] += 1. / 12. * amp_sv[0];
@@ -558,8 +652,11 @@
       // Amplitude(s) for diagram number 11
       FFV1_0( w_fp[10], w_fp[5], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[10] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[2] -= 1. / 4. * amp_sv[0];
@@ -574,8 +671,11 @@
       // Amplitude(s) for diagram number 12
       FFV1_0( w_fp[10], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[11] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[1] -= 1. / 4. * amp_sv[0];
@@ -590,8 +690,11 @@
       // Amplitude(s) for diagram number 13
       FFV1_0( w_fp[6], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[12] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 12. * amp_sv[0];
       jamp_sv[3] += 1. / 4. * amp_sv[0];
@@ -606,8 +709,11 @@
       // Amplitude(s) for diagram number 14
       FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[13] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 4. * amp_sv[0];
       jamp_sv[1] -= 1. / 12. * amp_sv[0];
@@ -1305,7 +1411,6 @@
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
   using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-  using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
 
@@ -1335,11 +1440,12 @@
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+        numerators_sv[i] = fptype_sv{ 0 };
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1414,38 +1520,7 @@
     const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-    // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-    if( allChannelIds != nullptr )
-    {
-      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-      // NB: channelIds_sv is a scalar in no-SIMD C++
-      channelId = channelIds_sv;
-#else
-      // NB: channelIds_sv is a vector in SIMD C++
-      channelId = channelIds_sv[0]; // element[0]
-      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-      {
-        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-      }
-#endif
-      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-      const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-      uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      for( int i = 0; i < neppV; ++i )
-      {
-        assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-      }
-#endif
-    }
+    unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
     // Running sum of partial amplitudes squared for event by event color selection (#402)
     // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1624,11 +1699,12 @@
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
      }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
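With sizePerEventNumerators equal to processConfig::ndiagrams, the numerators buffer holds nevt * ndiagrams values while the denominators keep one running sum per event; the ievt0 * processConfig::ndiagrams record offset used throughout these hunks follows directly from that layout. A small sketch of the implied sizing arithmetic, with illustrative nevt, ndiagrams and ievt0 values (not taken from any specific subprocess):

#include <cstddef>
#include <cstdio>

int main()
{
  const size_t nevt = 16384;   // illustrative number of events in the buffer
  const size_t ndiagrams = 36; // stand-in for processConfig::ndiagrams
  const size_t sizePerEventNumerators = ndiagrams; // was 1 before this patch
  const size_t numeratorsSize = nevt * sizePerEventNumerators;
  const size_t denominatorsSize = nevt; // unchanged: one running sum per event
  // record offset of event ievt0 in the enlarged numerators buffer
  const size_t ievt0 = 256;
  const size_t offset = ievt0 * ndiagrams;
  std::printf( "numerators=%zu denominators=%zu offset=%zu\n", numeratorsSize, denominatorsSize, offset );
  return 0;
}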
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc
index 77d7eddc6c..e2779260ff 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -109,6 +110,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
   constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -368,7 +432,7 @@
       for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
         COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -382,7 +446,7 @@
       unsigned int channelId = gpu_channelId( allChannelIds );
 #endif
       // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
 
@@ -408,8 +472,11 @@
       // Amplitude(s) for diagram number 1
       FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[0] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 36. * amp_sv[0];
       jamp_sv[1] -= 1. / 12. * amp_sv[0];
@@ -424,8 +491,11 @@
       // Amplitude(s) for diagram number 2
       FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[1] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 36. * amp_sv[0];
       jamp_sv[1] -= 1. / 12. * amp_sv[0];
@@ -440,8 +510,11 @@
       // Amplitude(s) for diagram number 3
       VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[2] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
@@ -454,8 +527,11 @@
       // Amplitude(s) for diagram number 4
       FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( channelId != 0 )
+      {
+        numerators_sv[3] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 36. * amp_sv[0];
       jamp_sv[1] -= 1. / 12. * amp_sv[0];
* amp_sv[0]; @@ -470,8 +546,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -486,8 +565,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -502,8 +584,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[3], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -1201,7 +1286,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1231,11 +1315,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1310,38 +1395,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1520,11 +1574,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 4f2c72bff8..c105c712fd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace 
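With one numerator slot per diagram, the channel-specific enhancement becomes a single indexed read at the end of the event loop (MEs_sv *= numerators_sv[channelId - 1] / denominators_sv above). A scalar sketch of that arithmetic follows, with illustrative names and types; the real code operates on SIMD vectors and accumulates the denominator diagram by diagram:

    #include <vector>

    // me: matrix element summed over all diagrams; amp2: |amp|^2 accumulated per diagram.
    inline double applySingleDiagramEnhancement( double me, const std::vector<double>& amp2, unsigned int channelId )
    {
      if( channelId == 0 ) return me;                // multichannel disabled: no reweighting
      double denominator = 0.;
      for( double a2 : amp2 ) denominator += a2;     // denominator: sum of |amp|^2 over all diagrams
      return me * amp2[channelId - 1] / denominator; // numerator: the chosen diagram (ids are 1-based)
    }

Storing all diagrams and selecting at the end is what allows the per-diagram 'if( channelId == N )' branches to collapse into a single 'if( channelId != 0 )' guard inside the diagram loop.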
mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -360,7 +424,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -374,7 +438,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +464,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -416,8 +483,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -432,8 +502,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -448,8 +521,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += 1. / 6. 
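Because the numerator buffer now holds processConfig::ndiagrams values per event instead of one, the record lookup scales the event index (ievt0 * processConfig::ndiagrams above) and kernelAccessP hands back a pointer to SIMD vectors so each diagram slot can be indexed as numerators_sv[idiag]. A sketch of the assumed layout and access, with an illustrative diagram count and a stand-in vector type:

    #include <cstddef>

    using fptype = double;
    constexpr int neppV = 4;      // illustrative SIMD width
    constexpr int ndiagrams = 36; // illustrative; the generated processConfig.h carries the real value

    struct fptype_v { fptype d[neppV]; }; // stand-in for the SIMD vector type

    // Flat offset of the first numerator of event page 'ipagV': ndiagrams vectors per page.
    inline std::size_t firstNumeratorSlot( std::size_t ipagV )
    {
      return ipagV * ndiagrams * neppV;
    }

    // View one page's numerators as an indexable array of SIMD rows; this mirrors the
    // reinterpret-style access the kernelAccessP helper is assumed to perform in place.
    inline fptype_v* diagramRows( fptype* page )
    {
      return reinterpret_cast<fptype_v*>( page );
    }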
* cxtype( 0, 1 ) * amp_sv[0]; @@ -464,8 +540,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -481,8 +560,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[10] -= 1. / 2. * amp_sv[0]; @@ -496,8 +578,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[0], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. * amp_sv[0]; @@ -510,8 +595,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. * amp_sv[0]; @@ -524,8 +612,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -538,8 +629,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 VVV1_0( w_fp[5], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -553,8 +647,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[9] -= 1. / 2. 
* amp_sv[0]; @@ -567,8 +664,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[0], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. * amp_sv[0]; @@ -581,8 +681,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. * amp_sv[0]; @@ -595,8 +698,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -609,8 +715,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[5], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -624,8 +733,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += 1. / 6. * amp_sv[0]; jamp_sv[9] -= 1. / 2. * amp_sv[0]; @@ -638,8 +750,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * amp_sv[0]; jamp_sv[8] += 1. / 6. * amp_sv[0]; @@ -652,8 +767,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 6. * amp_sv[0]; jamp_sv[9] -= 1. / 2. 
* amp_sv[0]; @@ -666,8 +784,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[0], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * amp_sv[0]; jamp_sv[6] += 1. / 6. * amp_sv[0]; @@ -680,8 +801,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[5], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -695,8 +819,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[20] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * amp_sv[0]; jamp_sv[8] += 1. / 6. * amp_sv[0]; @@ -709,8 +836,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += 1. / 6. * amp_sv[0]; jamp_sv[10] -= 1. / 2. * amp_sv[0]; @@ -723,8 +853,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += 1. / 6. * amp_sv[0]; jamp_sv[10] -= 1. / 2. * amp_sv[0]; @@ -737,8 +870,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * amp_sv[0]; jamp_sv[7] += 1. / 6. * amp_sv[0]; @@ -751,8 +887,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[24] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -765,8 +904,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -779,8 +921,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -793,8 +938,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[4] -= 1. / 2. * amp_sv[0]; @@ -807,8 +955,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -821,8 +972,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[0], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 2. * amp_sv[0]; jamp_sv[7] += 1. / 6. * amp_sv[0]; @@ -835,8 +989,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -849,8 +1006,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[31] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[6] += 1. / 6. * amp_sv[0]; @@ -863,8 +1023,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -899,8 +1062,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 VVV1_0( w_fp[5], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -915,8 +1081,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * amp_sv[0]; jamp_sv[4] -= 1. / 2. * amp_sv[0]; @@ -1614,7 +1783,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1644,11 +1812,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1723,38 +1892,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! 
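In the mixed-precision SIMD build (MGONGPU_FPTYPE_DOUBLE with MGONGPU_FPTYPE2_FLOAT), a single channel id must cover two consecutive neppV pages (#924), which is why the helper checks a second page behind the sanityCheckMixedPrecision flag. A standalone sketch of that extra check, assuming a flat per-event channel-id array and an illustrative SIMD width:

    #include <cassert>

    constexpr int neppV = 4; // illustrative SIMD width

    // Verify that the second neppV page (iParity=1) carries the same channel id as the first.
    inline void checkSecondPage( const unsigned int* allChannelIds, int ievt00, unsigned int channelId )
    {
      const unsigned int* page2 = allChannelIds + ievt00 + neppV; // assumes one id per event, flat layout
      for( int i = 0; i < neppV; ++i )
        assert( channelId == page2[i] ); // cf. #898 and #924
    }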
#898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1933,11 +2071,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 65f0e5aaf4..915207bda3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include 
"MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -360,7 +424,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -374,7 +438,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +464,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -416,8 +483,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -432,8 +502,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -446,8 +519,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -462,8 +538,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -480,8 +559,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -496,8 +578,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -512,8 +597,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; @@ -526,8 +614,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -542,8 +633,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. 
* amp_sv[0]; @@ -558,8 +652,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -574,8 +671,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[10], w_fp[4], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -590,8 +690,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[6], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -606,8 +709,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -1305,7 +1411,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1335,11 +1440,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1414,38 +1520,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1624,11 +1699,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index d938dc9999..895b1674ac 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include 
"MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -103,6 +104,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -362,7 +426,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -376,7 +440,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -402,8 +466,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -418,8 +485,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -434,8 +504,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 4. 
@@ -448,8 +521,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 36. * amp_sv[0]; @@ -464,8 +540,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -480,8 +559,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[0], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -496,8 +578,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[3], w_fp[0], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 36. * amp_sv[0]; @@ -1195,7 +1280,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1225,11 +1309,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1304,38 +1389,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1514,11 +1568,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index ef2de04fdb..10ccb38efa 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include 
"MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -360,7 +424,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -374,7 +438,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +464,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -416,8 +483,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -432,8 +502,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -446,8 +519,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 36. * amp_sv[0]; @@ -462,8 +538,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -480,8 +559,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -496,8 +578,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -512,8 +597,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; @@ -526,8 +614,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[0], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -542,8 +633,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0];
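With sizePerEventNumerators equal to the number of diagrams, the record lookup NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ) shown earlier in this file's diff stretches each event record to ndiagrams slots, while denominators keep a single slot per event. A hypothetical flat-array model of that addressing (the real stride and any SIMD interleaving are defined by the MemoryAccess classes, not by this sketch):

// Illustrative only: page-major layout [event0: diag0..diagN-1][event1: ...]
inline double* numeratorSlot( double* allNumerators, int ievt0, int ndiagrams, int idiag )
{
  // mirrors ieventAccessRecord( allNumerators, ievt0 * ndiagrams ) followed by [idiag]
  return allNumerators + ievt0 * ndiagrams + idiag;
}
inline double* denominatorSlot( double* allDenominators, int ievt0 )
{
  return allDenominators + ievt0; // denominators remain one slot per event
}

In sigmaKin the same pointers are used twice: once to zero the ndiagrams slots at the start of each event page, and once at the end to apply numerators_sv[channelId - 1] / denominators_sv.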
@@ -558,8 +652,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -574,8 +671,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[10], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -590,8 +690,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[6], w_fp[0], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -606,8 +709,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[10], w_fp[0], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 36. * amp_sv[0]; @@ -1305,7 +1411,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1335,11 +1440,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1414,38 +1520,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1624,11 +1699,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index e728335e4c..0986c3df28 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. 
For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,14 +49,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -72,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.07860422134399414  +DEBUG: model prefixing takes 0.0343935489654541  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -87,7 +88,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.729 s +1 processes with 72 diagrams generated in 1.421 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -98,10 +99,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False - INFO: initialize a new directory: CODEGEN_mad_smeft_gg_tttt INFO: remove old information in CODEGEN_mad_smeft_gg_tttt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -113,25 +114,25 @@ FileWriter t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 
56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (72 diagrams) in 0.132 s -Wrote files for 119 helas calls in 0.360 s +DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (72 diagrams) in 0.069 s +Wrote files for 119 helas calls in 1.641 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.215 s +ALOHA: aloha creates 5 routines in 0.144 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.214 s +ALOHA: aloha creates 10 routines in 0.137 s VVV5 VVV5 FFV1 @@ -141,32 +142,34 @@ ALOHA: aloha creates 10 routines in 0.214 s VVVV1 VVVV9 VVVV10 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. 
+FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README Run "open index.html" to see more information about this process. quit -real 0m5.833s -user 0m5.426s -sys 0m0.391s -Code generation completed in 6 seconds +real 0m9.061s +user 0m3.272s +sys 0m0.694s +Code generation completed in 9 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -187,9 +190,9 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -216,9 +219,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/ident_card.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/ident_card.dat index 7758c3603b..ee875f040f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/ident_card.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/ident_card.dat @@ -216,17 +216,19 @@ decay 23 mdl_WZ decay 24 mdl_WW decay 25 mdl_WH decay 6 mdl_WT +mass 1 mdl_MD mass 11 mdl_Me mass 13 mdl_MMU mass 15 mdl_MTA -mass 1 mdl_MD +mass 2 mdl_MU mass 23 mdl_MZ mass 25 mdl_MH -mass 2 mdl_MU mass 3 mdl_MS mass 4 mdl_MC mass 5 mdl_MB mass 6 mdl_MT +smeft 1 mdl_cG +smeft 10 mdl_cuHRe smeft 100 mdl_ceHRe smeft 101 mdl_ceWRe smeft 102 mdl_ceBRe @@ -237,7 +239,7 @@ smeft 106 mdl_cll smeft 107 mdl_cll1 smeft 108 mdl_clj1 smeft 109 mdl_clj3 -smeft 10 mdl_cuHRe +smeft 11 mdl_ctHRe smeft 110 mdl_cQl1 smeft 111 mdl_cQl3 smeft 112 mdl_cee @@ -248,7 +250,7 @@ smeft 116 mdl_cbe smeft 117 mdl_cje smeft 118 mdl_cQe smeft 119 mdl_clu -smeft 11 mdl_ctHRe +smeft 12 mdl_cdHRe smeft 120 mdl_ctl smeft 121 mdl_cld smeft 122 mdl_cbl @@ -259,7 +261,6 @@ smeft 126 mdl_cleju1Re smeft 127 mdl_cleQt1Re smeft 128 mdl_cleju3Re smeft 129 mdl_cleQt3Re -smeft 12 mdl_cdHRe smeft 13 mdl_cbHRe smeft 14 mdl_cuGRe smeft 15 mdl_ctGRe @@ -267,7 +268,7 @@ smeft 16 mdl_cuWRe smeft 17 mdl_ctWRe smeft 18 mdl_cuBRe smeft 19 mdl_ctBRe -smeft 1 mdl_cG +smeft 2 mdl_cW smeft 20 mdl_cdGRe smeft 21 mdl_cbGRe smeft 22 mdl_cdWRe @@ -278,7 +279,7 @@ smeft 26 mdl_cHj1 smeft 27 mdl_cHQ1 smeft 28 mdl_cHj3 smeft 29 mdl_cHQ3 -smeft 2 mdl_cW +smeft 3 mdl_cH smeft 30 mdl_cHu smeft 31 mdl_cHt smeft 32 mdl_cHd @@ -289,7 +290,7 @@ smeft 36 mdl_cjj11 smeft 37 mdl_cjj18 smeft 38 mdl_cjj31 smeft 39 mdl_cjj38 -smeft 3 mdl_cH +smeft 4 mdl_cHbox smeft 40 mdl_cQj11 smeft 41 mdl_cQj18 smeft 42 mdl_cQj31 @@ -300,7 +301,7 @@ smeft 46 mdl_cuu1 smeft 47 mdl_cuu8 smeft 48 mdl_ctt smeft 49 mdl_ctu1 -smeft 4 mdl_cHbox +smeft 5 mdl_cHDD smeft 50 mdl_ctu8 smeft 51 mdl_cdd1 smeft 52 mdl_cdd8 @@ -311,7 +312,7 @@ smeft 56 mdl_cud1 smeft 57 mdl_ctb1 smeft 58 mdl_ctd1 smeft 59 mdl_cbu1 -smeft 5 mdl_cHDD +smeft 6 mdl_cHG smeft 60 mdl_cud8 smeft 61 mdl_ctb8 smeft 62 mdl_ctd8 @@ -322,7 +323,7 @@ smeft 66 mdl_cju1 smeft 67 mdl_cQu1 smeft 68 mdl_cju8 smeft 69 mdl_cQu8 -smeft 6 mdl_cHG +smeft 7 mdl_cHW smeft 70 mdl_ctj1 smeft 71 mdl_ctj8 smeft 72 mdl_cQt1 @@ -333,7 +334,7 @@ smeft 76 mdl_cQd1 smeft 77 mdl_cQd8 smeft 78 mdl_cbj1 smeft 79 mdl_cbj8 -smeft 7 mdl_cHW +smeft 8 mdl_cHB smeft 80 mdl_cQb1 smeft 81 mdl_cQb8 smeft 82 mdl_cjQtu1Re @@ -344,7 +345,7 @@ smeft 86 mdl_cjujd1Re smeft 87 mdl_cjujd8Re smeft 88 mdl_cjujd11Re smeft 89 mdl_cjujd81Re -smeft 8 mdl_cHB +smeft 9 mdl_cHWB smeft 90 mdl_cQtjd1Re smeft 91 mdl_cQtjd8Re smeft 92 mdl_cjuQb1Re @@ -355,7 +356,7 @@ smeft 96 mdl_cjtQd1Re smeft 97 mdl_cjtQd8Re smeft 98 mdl_cQtQb1Re smeft 99 mdl_cQtQb8Re -smeft 9 mdl_cHWB +smeftcpv 1 mdl_cGtil smeftcpv 10 mdl_ctWIm smeftcpv 11 mdl_cuBIm smeftcpv 12 mdl_ctBIm @@ -366,7 +367,7 @@ smeftcpv 16 mdl_cbWIm smeftcpv 17 mdl_cdBIm smeftcpv 18 mdl_cbBIm smeftcpv 19 mdl_cuHIm -smeftcpv 1 mdl_cGtil +smeftcpv 2 mdl_cWtil smeftcpv 20 mdl_ctHIm smeftcpv 21 mdl_cdHIm smeftcpv 22 mdl_cbHIm @@ -377,7 +378,7 @@ smeftcpv 26 mdl_cutbd8Im smeftcpv 27 mdl_cjQtu1Im smeftcpv 28 mdl_cjQtu8Im smeftcpv 29 mdl_cjQbd1Im -smeftcpv 2 mdl_cWtil +smeftcpv 3 mdl_cHGtil smeftcpv 30 mdl_cjQbd8Im smeftcpv 31 mdl_cjujd1Im smeftcpv 32 mdl_cjujd8Im @@ -388,7 +389,7 @@ smeftcpv 36 mdl_cQtjd8Im smeftcpv 37 mdl_cjuQb1Im smeftcpv 38 mdl_cjuQb8Im smeftcpv 39 
mdl_cQujb1Im -smeftcpv 3 mdl_cHGtil +smeftcpv 4 mdl_cHWtil smeftcpv 40 mdl_cQujb8Im smeftcpv 41 mdl_cjtQd1Im smeftcpv 42 mdl_cjtQd8Im @@ -399,12 +400,11 @@ smeftcpv 46 mdl_ceWIm smeftcpv 47 mdl_ceBIm smeftcpv 48 mdl_cledjIm smeftcpv 49 mdl_clebQIm -smeftcpv 4 mdl_cHWtil +smeftcpv 5 mdl_cHBtil smeftcpv 50 mdl_cleju1Im smeftcpv 51 mdl_cleju3Im smeftcpv 52 mdl_cleQt1Im smeftcpv 53 mdl_cleQt3Im -smeftcpv 5 mdl_cHBtil smeftcpv 6 mdl_cHWBtil smeftcpv 7 mdl_cuGIm smeftcpv 8 mdl_ctGIm @@ -414,10 +414,10 @@ sminputs 1 mdl_MW sminputs 2 mdl_Gf sminputs 3 aS switches 1 mdl_linearPropCorrections +yukawa 1 mdl_ymdo yukawa 11 mdl_yme yukawa 13 mdl_ymm yukawa 15 mdl_ymtau -yukawa 1 mdl_ymdo yukawa 2 mdl_ymup yukawa 3 mdl_yms yukawa 4 mdl_ymc diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts b/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/param_card.inc b/epochX/cudacpp/smeft_gg_tttt.mad/Source/param_card.inc index dab3aac603..e7bc7ae438 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/param_card.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/param_card.inc @@ -2,17 +2,19 @@ MDL_WW = 2.085000D+00 MDL_WH = 4.070000D-03 MDL_WT = 1.330000D+00 + MDL_MD = 4.670000D-03 MDL_ME = 5.110000D-04 MDL_MMU = 1.056600D-01 MDL_MTA = 1.777000D+00 - MDL_MD = 4.670000D-03 + MDL_MU = 2.160000D-03 MDL_MZ = 9.118760D+01 MDL_MH = 1.250900D+02 - MDL_MU = 2.160000D-03 MDL_MS = 9.300000D-02 MDL_MC = 1.270000D+00 MDL_MB = 4.180000D+00 MDL_MT = 1.727600D+02 + MDL_CG = 0.000000D+00 + MDL_CUHRE = 0.000000D+00 MDL_CEHRE = 0.000000D+00 MDL_CEWRE = 0.000000D+00 MDL_CEBRE = 0.000000D+00 @@ -23,7 +25,7 @@ MDL_CLL1 = 0.000000D+00 MDL_CLJ1 = 0.000000D+00 MDL_CLJ3 = 0.000000D+00 - MDL_CUHRE = 0.000000D+00 + MDL_CTHRE = 0.000000D+00 MDL_CQL1 = 0.000000D+00 MDL_CQL3 = 0.000000D+00 MDL_CEE = 0.000000D+00 @@ -34,7 +36,7 @@ MDL_CJE = 0.000000D+00 MDL_CQE = 0.000000D+00 MDL_CLU = 0.000000D+00 - MDL_CTHRE = 0.000000D+00 + MDL_CDHRE = 0.000000D+00 MDL_CTL = 0.000000D+00 MDL_CLD = 0.000000D+00 MDL_CBL = 0.000000D+00 @@ -45,7 +47,6 @@ MDL_CLEQT1RE = 0.000000D+00 MDL_CLEJU3RE = 0.000000D+00 MDL_CLEQT3RE = 0.000000D+00 - MDL_CDHRE = 0.000000D+00 MDL_CBHRE = 0.000000D+00 MDL_CUGRE = 0.000000D+00 MDL_CTGRE = 0.000000D+00 @@ -53,7 +54,7 @@ MDL_CTWRE = 0.000000D+00 MDL_CUBRE = 0.000000D+00 MDL_CTBRE = 0.000000D+00 - MDL_CG = 0.000000D+00 + MDL_CW = 0.000000D+00 MDL_CDGRE = 0.000000D+00 MDL_CBGRE = 0.000000D+00 MDL_CDWRE = 0.000000D+00 @@ -64,7 +65,7 @@ MDL_CHQ1 
= 0.000000D+00 MDL_CHJ3 = 0.000000D+00 MDL_CHQ3 = 0.000000D+00 - MDL_CW = 0.000000D+00 + MDL_CH = 0.000000D+00 MDL_CHU = 0.000000D+00 MDL_CHT = 0.000000D+00 MDL_CHD = 0.000000D+00 @@ -75,7 +76,7 @@ MDL_CJJ18 = 0.000000D+00 MDL_CJJ31 = 0.000000D+00 MDL_CJJ38 = 0.000000D+00 - MDL_CH = 0.000000D+00 + MDL_CHBOX = 0.000000D+00 MDL_CQJ11 = 0.000000D+00 MDL_CQJ18 = 0.000000D+00 MDL_CQJ31 = 0.000000D+00 @@ -86,7 +87,7 @@ MDL_CUU8 = 0.000000D+00 MDL_CTT = 0.000000D+00 MDL_CTU1 = 0.000000D+00 - MDL_CHBOX = 0.000000D+00 + MDL_CHDD = 0.000000D+00 MDL_CTU8 = 0.000000D+00 MDL_CDD1 = 0.000000D+00 MDL_CDD8 = 0.000000D+00 @@ -97,7 +98,7 @@ MDL_CTB1 = 0.000000D+00 MDL_CTD1 = 0.000000D+00 MDL_CBU1 = 0.000000D+00 - MDL_CHDD = 0.000000D+00 + MDL_CHG = 0.000000D+00 MDL_CUD8 = 0.000000D+00 MDL_CTB8 = 0.000000D+00 MDL_CTD8 = 0.000000D+00 @@ -108,7 +109,7 @@ MDL_CQU1 = 0.000000D+00 MDL_CJU8 = 0.000000D+00 MDL_CQU8 = 0.000000D+00 - MDL_CHG = 0.000000D+00 + MDL_CHW = 0.000000D+00 MDL_CTJ1 = 0.000000D+00 MDL_CTJ8 = 0.000000D+00 MDL_CQT1 = 0.000000D+00 @@ -119,7 +120,7 @@ MDL_CQD8 = 0.000000D+00 MDL_CBJ1 = 0.000000D+00 MDL_CBJ8 = 0.000000D+00 - MDL_CHW = 0.000000D+00 + MDL_CHB = 0.000000D+00 MDL_CQB1 = 0.000000D+00 MDL_CQB8 = 0.000000D+00 MDL_CJQTU1RE = 0.000000D+00 @@ -130,7 +131,7 @@ MDL_CJUJD8RE = 0.000000D+00 MDL_CJUJD11RE = 0.000000D+00 MDL_CJUJD81RE = 0.000000D+00 - MDL_CHB = 0.000000D+00 + MDL_CHWB = 0.000000D+00 MDL_CQTJD1RE = 0.000000D+00 MDL_CQTJD8RE = 0.000000D+00 MDL_CJUQB1RE = 0.000000D+00 @@ -141,7 +142,7 @@ MDL_CJTQD8RE = 0.000000D+00 MDL_CQTQB1RE = 0.000000D+00 MDL_CQTQB8RE = 0.000000D+00 - MDL_CHWB = 0.000000D+00 + MDL_CGTIL = 0.000000D+00 MDL_CTWIM = 0.000000D+00 MDL_CUBIM = 0.000000D+00 MDL_CTBIM = 0.000000D+00 @@ -152,7 +153,7 @@ MDL_CDBIM = 0.000000D+00 MDL_CBBIM = 0.000000D+00 MDL_CUHIM = 0.000000D+00 - MDL_CGTIL = 0.000000D+00 + MDL_CWTIL = 0.000000D+00 MDL_CTHIM = 0.000000D+00 MDL_CDHIM = 0.000000D+00 MDL_CBHIM = 0.000000D+00 @@ -163,7 +164,7 @@ MDL_CJQTU1IM = 0.000000D+00 MDL_CJQTU8IM = 0.000000D+00 MDL_CJQBD1IM = 0.000000D+00 - MDL_CWTIL = 0.000000D+00 + MDL_CHGTIL = 0.000000D+00 MDL_CJQBD8IM = 0.000000D+00 MDL_CJUJD1IM = 0.000000D+00 MDL_CJUJD8IM = 0.000000D+00 @@ -174,7 +175,7 @@ MDL_CJUQB1IM = 0.000000D+00 MDL_CJUQB8IM = 0.000000D+00 MDL_CQUJB1IM = 0.000000D+00 - MDL_CHGTIL = 0.000000D+00 + MDL_CHWTIL = 0.000000D+00 MDL_CQUJB8IM = 0.000000D+00 MDL_CJTQD1IM = 0.000000D+00 MDL_CJTQD8IM = 0.000000D+00 @@ -185,12 +186,11 @@ MDL_CEBIM = 0.000000D+00 MDL_CLEDJIM = 0.000000D+00 MDL_CLEBQIM = 0.000000D+00 - MDL_CHWTIL = 0.000000D+00 + MDL_CHBTIL = 0.000000D+00 MDL_CLEJU1IM = 0.000000D+00 MDL_CLEJU3IM = 0.000000D+00 MDL_CLEQT1IM = 0.000000D+00 MDL_CLEQT3IM = 0.000000D+00 - MDL_CHBTIL = 0.000000D+00 MDL_CHWBTIL = 0.000000D+00 MDL_CUGIM = 0.000000D+00 MDL_CTGIM = 0.000000D+00 @@ -200,10 +200,10 @@ MDL_GF = 1.166379D-05 AS = 1.179000D-01 MDL_LINEARPROPCORRECTIONS = 0.000000D+00 + MDL_YMDO = 4.670000D-03 MDL_YME = 5.110000D-04 MDL_YMM = 1.056600D-01 MDL_YMTAU = 1.777000D+00 - MDL_YMDO = 4.670000D-03 MDL_YMUP = 2.160000D-03 MDL_YMS = 9.300000D-02 MDL_YMC = 1.270000D+00 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel 
event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h index 24800c08c9..50496fa2bf 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc index 84ba0de9b4..1eb18a90d3 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -413,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -429,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV5_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -446,8 +519,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -462,8 +538,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[8], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -478,8 +557,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 VVV5_0( w_fp[6], w_fp[11], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -494,8 +576,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -510,8 +595,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -526,8 +614,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -542,8 +633,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -560,8 +654,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[5] += 1. / 6. 
* amp_sv[0]; @@ -574,8 +671,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[13], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * amp_sv[0]; jamp_sv[5] += 1. / 2. * amp_sv[0]; @@ -589,8 +689,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[5], w_fp[6], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 6. * amp_sv[0]; jamp_sv[5] += 1. / 2. * amp_sv[0]; @@ -603,8 +706,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * amp_sv[0]; jamp_sv[5] += 1. / 6. * amp_sv[0]; @@ -617,8 +723,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 FFV1_0( w_fp[15], w_fp[4], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[4] -= 1. / 6. * amp_sv[0]; @@ -631,8 +740,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[15], w_fp[12], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[4] -= 1. / 2. * amp_sv[0]; @@ -645,8 +757,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[5], w_fp[16], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -659,8 +774,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -673,8 +791,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * amp_sv[0]; jamp_sv[1] += 1. / 2. * amp_sv[0]; @@ -687,8 +808,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV5_0( w_fp[1], w_fp[10], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -703,8 +827,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[5], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[20] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 6. * amp_sv[0]; jamp_sv[8] -= 1. / 2. * amp_sv[0]; @@ -717,8 +844,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[14], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 2. * amp_sv[0]; jamp_sv[8] -= 1. / 6. * amp_sv[0]; @@ -731,8 +861,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[5], w_fp[6], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 6. * amp_sv[0]; jamp_sv[10] += 1. / 2. * amp_sv[0]; @@ -745,8 +878,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[14], w_fp[6], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * amp_sv[0]; jamp_sv[10] += 1. / 6. * amp_sv[0]; @@ -759,8 +895,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 FFV1_0( w_fp[15], w_fp[4], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[24] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 6. * amp_sv[0]; jamp_sv[6] += 1. / 2. 
* amp_sv[0]; @@ -773,8 +912,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * amp_sv[0]; jamp_sv[6] += 1. / 6. * amp_sv[0]; @@ -787,8 +929,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[17], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= 1. / 2. * amp_sv[0]; jamp_sv[10] += 1. / 6. * amp_sv[0]; @@ -801,8 +946,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 VVV5_0( w_fp[1], w_fp[11], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -815,8 +963,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[17], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= 1. / 6. * amp_sv[0]; jamp_sv[10] += 1. / 2. * amp_sv[0]; @@ -829,8 +980,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 VVV5_0( w_fp[1], w_fp[10], w_fp[12], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -844,8 +998,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[5], w_fp[16], w_fp[17], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 6. * amp_sv[0]; jamp_sv[7] -= 1. / 2. * amp_sv[0]; @@ -858,8 +1015,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[31] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 2. * amp_sv[0]; jamp_sv[7] -= 1. / 6. 
* amp_sv[0]; @@ -872,8 +1032,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * amp_sv[0]; jamp_sv[7] -= 1. / 6. * amp_sv[0]; @@ -886,8 +1049,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 34 FFV1_0( w_fp[13], w_fp[12], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[33] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[7] -= 1. / 2. * amp_sv[0]; @@ -900,8 +1066,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 FFV1_0( w_fp[15], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * amp_sv[0]; jamp_sv[6] += 1. / 6. * amp_sv[0]; @@ -914,8 +1083,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 FFV1_0( w_fp[15], w_fp[12], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 6. * amp_sv[0]; jamp_sv[6] += 1. / 2. * amp_sv[0]; @@ -928,8 +1100,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 37 FFV1_0( w_fp[5], w_fp[14], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[36] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * amp_sv[0]; jamp_sv[3] -= 1. / 6. * amp_sv[0]; @@ -942,8 +1117,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 38 VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[37] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -956,8 +1134,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 39 FFV1_0( w_fp[3], w_fp[14], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[38] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. 
* amp_sv[0]; @@ -970,8 +1151,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 40 VVV5_0( w_fp[1], w_fp[11], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[39] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -985,8 +1169,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 41 FFV1_0( w_fp[3], w_fp[16], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[40] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= 1. / 6. * amp_sv[0]; jamp_sv[9] += 1. / 2. * amp_sv[0]; @@ -999,8 +1186,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 42 FFV1_0( w_fp[17], w_fp[16], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[41] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= 1. / 2. * amp_sv[0]; jamp_sv[9] += 1. / 6. * amp_sv[0]; @@ -1013,8 +1203,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 43 FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[42] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 6. * amp_sv[0]; jamp_sv[7] -= 1. / 2. * amp_sv[0]; @@ -1027,8 +1220,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 44 FFV1_0( w_fp[13], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[43] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * amp_sv[0]; jamp_sv[7] -= 1. / 6. * amp_sv[0]; @@ -1041,8 +1237,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 45 FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[44] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 6. * amp_sv[0]; jamp_sv[11] -= 1. / 2. * amp_sv[0]; @@ -1055,8 +1254,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 46 FFV1_0( w_fp[17], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[45] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * amp_sv[0]; jamp_sv[11] -= 1. / 6. 
* amp_sv[0]; @@ -1069,8 +1271,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 47 FFV1_0( w_fp[12], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[46] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += 1. / 2. * amp_sv[0]; jamp_sv[11] -= 1. / 6. * amp_sv[0]; @@ -1083,8 +1288,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 48 VVV5_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 48 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[47] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -1097,8 +1305,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 49 FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[48] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += 1. / 6. * amp_sv[0]; jamp_sv[11] -= 1. / 2. * amp_sv[0]; @@ -1111,8 +1322,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 50 VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[49] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -1125,8 +1339,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 51 FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[50] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= 1. / 2. * amp_sv[0]; jamp_sv[9] += 1. / 6. * amp_sv[0]; @@ -1139,8 +1356,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 52 FFV1_0( w_fp[5], w_fp[16], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[51] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -1153,8 +1373,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 53 FFV1_0( w_fp[3], w_fp[9], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[52] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= 1. / 6. * amp_sv[0]; jamp_sv[9] += 1. / 2. * amp_sv[0]; @@ -1167,8 +1390,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 54 FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[53] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -1181,8 +1407,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 55 FFV1_0( w_fp[16], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[54] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. * amp_sv[0]; @@ -1195,8 +1424,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 56 FFV1_0( w_fp[13], w_fp[4], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[55] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -1209,8 +1441,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 57 FFV1_0( w_fp[16], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[56] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * amp_sv[0]; jamp_sv[2] += 1. / 2. * amp_sv[0]; @@ -1223,8 +1458,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 58 FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[57] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -1237,8 +1475,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 59 FFV1_0( w_fp[5], w_fp[13], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[58] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += 1. / 2. * amp_sv[0]; jamp_sv[11] -= 1. / 6. * amp_sv[0]; @@ -1251,8 +1492,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 60 FFV1_0( w_fp[5], w_fp[6], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[59] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -1265,8 +1509,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 61 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[60] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += 1. / 6. * amp_sv[0]; jamp_sv[11] -= 1. / 2. * amp_sv[0]; @@ -1279,8 +1526,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 62 FFV1_0( w_fp[3], w_fp[6], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[61] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -1293,8 +1543,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 63 FFV1_0( w_fp[6], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[62] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[3] -= 1. / 6. * amp_sv[0]; @@ -1307,8 +1560,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 64 FFV1_0( w_fp[15], w_fp[4], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[63] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -1321,8 +1577,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 65 FFV1_0( w_fp[6], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[64] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -1335,8 +1594,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 66 FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[65] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -1371,8 +1633,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 68 VVV5_0( w_fp[1], w_fp[10], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[67] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -1387,8 +1652,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 69 VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[68] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -1425,8 +1693,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 71 VVV5_0( w_fp[1], w_fp[8], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[70] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[4] += 1. / 2. * amp_sv[0]; @@ -1441,8 +1712,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 72 VVV5_0( w_fp[1], w_fp[11], w_fp[12], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[71] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * amp_sv[0]; jamp_sv[4] += 1. / 2. 
* amp_sv[0]; @@ -2140,7 +2414,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -2170,11 +2443,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -2249,38 +2523,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -2459,11 +2702,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index 065f7b4329..f6defe3d98 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,49 +49,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t -INFO: download model from http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz to the following directory: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models  ---2025-10-22 11:49:03-- http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz -Resolving feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)... 130.104.48.109 -Connecting to feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)|130.104.48.109|:80... connected. 
-HTTP request sent, awaiting response... 200 Ok -Length: 80562 (79K) [application/x-tar] -Saving to: ‘tmp.tgz’ - - 0K .......... .......... .......... .......... .......... 63% 830K 0s - 50K .......... .......... ........ 100% 124M=0.06s - -2025-10-22 11:49:03 (1.27 MB/s) - ‘tmp.tgz’ saved [80562/80562] - -SMEFTsim_topU3l_MwScheme_UFO/ -SMEFTsim_topU3l_MwScheme_UFO/__init__.py -SMEFTsim_topU3l_MwScheme_UFO/param_card_massless.dat -SMEFTsim_topU3l_MwScheme_UFO/CT_couplings.py -SMEFTsim_topU3l_MwScheme_UFO/particles.py -SMEFTsim_topU3l_MwScheme_UFO/write_param_card.py -SMEFTsim_topU3l_MwScheme_UFO/decays.py -SMEFTsim_topU3l_MwScheme_UFO/parameters.py -SMEFTsim_topU3l_MwScheme_UFO/restrict_massless.dat -SMEFTsim_topU3l_MwScheme_UFO/object_library.py -SMEFTsim_topU3l_MwScheme_UFO/coupling_orders.py -SMEFTsim_topU3l_MwScheme_UFO/version.info -SMEFTsim_topU3l_MwScheme_UFO/function_library.py -SMEFTsim_topU3l_MwScheme_UFO/couplings.py -SMEFTsim_topU3l_MwScheme_UFO/propagators.py -SMEFTsim_topU3l_MwScheme_UFO/lorentz.py -SMEFTsim_topU3l_MwScheme_UFO/vertices.py -SMEFTsim_topU3l_MwScheme_UFO/restrict_SMlimit_massless.dat -fail to load model but auto_convert_model is on True. Trying to convert the model -convert model /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models/SMEFTsim_topU3l_MwScheme_UFO -retry the load of the model +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -107,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.07803130149841309  +DEBUG: model prefixing takes 0.034352779388427734  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -116,22 +82,19 @@ Defined multiparticle l- = e- mu- Defined multiparticle vl = ve vm vt Defined multiparticle vl~ = ve~ vm~ vt~ Defined multiparticle all = g a ve vm vt ve~ vm~ vt~ u c t d s b t1 u~ c~ t~ d~ s~ b~ t1~ z w+ z1 w1+ h h1 w- w1- e- mu- ta- e+ mu+ ta+ -INFO: Change particles name to pass to MG5 convention -Kept definitions of multiparticles p / j / l+ / l- / vl / vl~ unchanged -Defined multiparticle all = g a ve vm vt ve~ vm~ vt~ u c t d s b t1 u~ c~ t~ d~ s~ b~ t1~ z w+ z1 w1+ h h1 w- w1- e- mu- ta- e+ mu+ ta+ generate g g > t t~ t t~ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
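(Aside on the buffer change that this 72-diagram process exercises: the numerator buffer now holds one fptype_sv slot per diagram per SIMD event page, as implied by sizePerEventNumerators = processConfig::ndiagrams. The following is a minimal standalone C++ sketch of that layout, not code from this patch; the SIMD width, the std::array stand-in for fptype_sv, and the helper name pageNumerators are all illustrative assumptions.)

// Sketch (assumption-based): per-diagram numerator storage with one
// fptype_sv slot per diagram per SIMD event page.
#include <array>
#include <cassert>
#include <vector>

constexpr int neppV = 4;                     // illustrative SIMD width (assumption)
using fptype = double;
using fptype_sv = std::array<fptype, neppV>; // stand-in for the real vector type
constexpr int ndiagrams = 72;                // processConfig::ndiagrams for this process

// Hypothetical helper: locate the first numerator slot of one event page.
// The real code computes the equivalent offset in scalar units as
// ievt0 * ndiagrams (with ievt0 = ipagV * neppV); this buffer is vector-typed.
inline fptype_sv* pageNumerators( std::vector<fptype_sv>& buf, int ipagV )
{
  return &buf[ipagV * ndiagrams];
}

int main()
{
  std::vector<fptype_sv> allNumerators( 2 * ndiagrams ); // two event pages (illustrative)
  fptype_sv* numerators_sv = pageNumerators( allNumerators, 0 );
  for( int i = 0; i < ndiagrams; ++i )
    numerators_sv[i] = fptype_sv{}; // zero-init loop, mirroring the patch's reset of all slots
  assert( numerators_sv[0][0] == 0 );
  return 0;
}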
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.695 s +1 processes with 72 diagrams generated in 1.410 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -140,18 +103,18 @@ INFO: Processing color information for process: g g > t t~ t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. -Generated helas calls for 1 subprocesses (72 diagrams) in 0.127 s +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. +Generated helas calls for 1 subprocesses (72 diagrams) in 0.068 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.281 s +ALOHA: aloha creates 5 routines in 0.139 s VVV5 VVV5 FFV1 @@ -161,17 +124,17 @@ ALOHA: aloha creates 5 routines in 0.281 s VVVV1 VVVV9 VVVV10 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m4.417s -user 0m3.862s -sys 0m0.114s -Code generation completed in 5 seconds +real 0m3.134s +user 0m2.098s +sys 0m0.123s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h index 24800c08c9..50496fa2bf 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc index 8c3316992a..6d552137f3 100644 ---
a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -2088,7 +2152,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -2118,11 +2181,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -2197,38 +2261,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -2407,11 +2440,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 01968dc817..30a2e623b3 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. 
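Note on the CPPProcess.cc hunks above: numerators now hold one accumulator per Feynman diagram instead of a single running sum per event, so that sigmaKin can later select exactly the numerator of the sampled channel. A minimal standalone sketch of the new buffer arithmetic follows (hypothetical toy values; the flat layout and scalar slots are simplifications of the real NUM_ACCESS AOSOA machinery, which interleaves neppV events per slot):

    #include <cassert>
    #include <vector>

    typedef double fptype;

    int main()
    {
      constexpr int ndiagrams = 6; // e.g. g g > t1 t1~ has 6 diagrams (see the codegen log below)
      constexpr int neppV = 4;     // hypothetical SIMD page size
      constexpr int nevt = 2 * neppV;

      // One numerator slot per event and per diagram (sizePerEventNumerators = ndiagrams)
      std::vector<fptype> allNumerators( nevt * ndiagrams, 0. );

      // Record for the SIMD page starting at ievt0: note the extra factor ndiagrams,
      // mirroring ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams )
      const int ievt0 = neppV; // second SIMD page
      fptype* numerators = allNumerators.data() + ievt0 * ndiagrams;

      // calculate_jamps accumulates |amp|^2 of diagram 2 into 0-based slot [1]
      const fptype amp2 = 0.5; // hypothetical squared amplitude
      numerators[1] += amp2;

      // sigmaKin later picks the numerator of the sampled channel (channelId is 1-based)
      const unsigned int channelId = 2;
      assert( numerators[channelId - 1] == amp2 );
      return 0;
    }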
This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -549,7 +550,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.071 s +1 processes with 6 diagrams generated in 0.079 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -560,10 +561,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False -- INFO: initialize a new directory: CODEGEN_mad_susy_gg_t1t1 INFO: remove old information in CODEGEN_mad_susy_gg_t1t1 DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -575,52 +576,54 @@ FileWriter t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (6 diagrams) in 0.007 s -Wrote files for 16 helas calls in 0.065 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (6 diagrams) in 0.005 s +Wrote files for 16 helas calls in 1.460 s ALOHA: aloha starts to 
compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.125 s +ALOHA: aloha creates 3 routines in 0.094 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.118 s +ALOHA: aloha creates 6 routines in 0.095 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README Run "open index.html" to see more information about this process. 
quit -real 0m2.714s -user 0m2.329s -sys 0m0.381s -Code generation completed in 3 seconds +real 0m8.992s +user 0m1.869s +sys 0m0.755s +Code generation completed in 9 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -641,9 +644,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -670,9 +673,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt
treatcards param
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/ident_card.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/ident_card.dat
index 9cfb7ac1a2..0c5a1bdd83 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/ident_card.dat
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/ident_card.dat
@@ -232,9 +232,9 @@ mse2 3 3 mdl_RmE23x3
 msl2 1 1 mdl_RmL21x1
 msl2 3 3 mdl_RmL23x3
 msoft 1 mdl_RMx1
+msoft 2 mdl_RMx2
 msoft 21 mdl_mHd2
 msoft 22 mdl_mHu2
-msoft 2 mdl_RMx2
 msoft 3 mdl_RMx3
 msq2 1 1 mdl_RmQ21x1
 msq2 3 3 mdl_RmQ23x3
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt
index 97e103a317..a0212bfb62 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt
@@ -235,7 +235,7 @@
 # pineappl = pineappl
-#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo
+#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo
 # MG5 MAIN DIRECTORY
-#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo
+#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts
index f10336e42e..74463b32eb 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts
@@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check
 MACFLAG=
 MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime
 PYTHIA8_PATH=NotInstalled
-STDLIB_FLAG=
 STDLIB=-lstdc++
+STDLIB_FLAG=
 #end_of_make_opts_variables
 BIASLIBDIR=../../../lib/
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/param_card.inc b/epochX/cudacpp/susy_gg_t1t1.mad/Source/param_card.inc
index 6acb037f00..a3d72e8ed8 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/param_card.inc
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/param_card.inc
@@ -81,9 +81,9 @@
 MDL_RML21X1 = 3.815567D+04
 MDL_RML23X3 = 3.782868D+04
 MDL_RMX1 = 1.013965D+02
+ MDL_RMX2 = 1.915042D+02
 MDL_MHD2 = 3.233749D+04
 MDL_MHU2 = -1.288001D+05
- MDL_RMX2 = 1.915042D+02
 MDL_RMX3 = 5.882630D+02
 MDL_RMQ21X1 = 2.998367D+05
 MDL_RMQ23X3 = 2.487654D+05
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
 }
+ // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+ // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+ static __host__ __device__ inline fptype_sv*
+ kernelAccessP( fptype* buffer )
+ {
+ return reinterpret_cast<fptype_sv*>( buffer );
+ }
+
 // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
 // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
 static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h
index c5e79dc1b1..76849a871e 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h
@@ -13,6
+13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_MSSM_SLHA2.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc index 89c03a7876..2ddd4b8cc9 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +464,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 VSS1_0( w_fp[4], w_fp[3], w_fp[2], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; @@ -414,8 +481,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; @@ -427,8 +497,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[3], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; @@ -440,8 +513,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VSS1_0( w_fp[1], w_fp[4], w_fp[2], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; @@ -453,8 +529,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 VSS1_0( w_fp[1], w_fp[2], w_fp[4], COUPs[3], -1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; @@ -1085,7 +1164,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - 
using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1115,11 +1193,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1194,38 +1273,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1404,11 +1452,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 0c5c2efcaf..9b4adc99fd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -549,13 +550,13 @@ INFO: Please specify coupling orders to bypass this step. 
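For reference, the logic that the new getChannelId helper factors out of calculate_jamps (and, with sanityCheckMixedPrecision = false, out of the sigmaKin reweighting path) condenses to the simplified scalar model below; a plain unsigned array stands in for the HostAccessChannelIds AOSOA, and the CUDA branch and the mixed-precision second page are omitted:

    #include <cassert>

    // Simplified model: all events in one SIMD page of width neppV must carry the
    // same channelId (sanity check #898); 0 means multichannel is disabled.
    unsigned int getChannelIdSketch( const unsigned int* allChannelIds, int ievt00, int neppV )
    {
      if( allChannelIds == nullptr ) return 0; // multichannel disabled
      const unsigned int channelId = allChannelIds[ievt00]; // first event of the page
      for( int i = 1; i < neppV; ++i )
        assert( channelId == allChannelIds[ievt00 + i] ); // uniform within the page
      assert( channelId > 0 ); // valid channel numbers are 1-based
      return channelId;
    }

    int main()
    {
      const unsigned int ids[8] = { 2, 2, 2, 2, 3, 3, 3, 3 };
      assert( getChannelIdSketch( ids, 0, 4 ) == 2 );
      assert( getChannelIdSketch( ids, 4, 4 ) == 3 );
      assert( getChannelIdSketch( nullptr, 0, 4 ) == 0 );
      return 0;
    }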
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.074 s +1 processes with 6 diagrams generated in 0.072 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -564,32 +565,32 @@ INFO: Processing color information for process: g g > t1 t1~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. 
-Generated helas calls for 1 subprocesses (6 diagrams) in 0.006 s +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. +Generated helas calls for 1 subprocesses (6 diagrams) in 0.005 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.126 s +ALOHA: aloha creates 3 routines in 0.088 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
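The MemoryAccessGs.h hunks just below add kernelAccessP, which reinterprets a raw fptype buffer as an array of SIMD vectors so that per-diagram numerators can be indexed as numerators_sv[i]. The cast reduces to the pattern sketched here (standalone toy; fptype_sv is modeled as a plain aggregate rather than the real compiler vector type, with the same aliasing caveats as the actual code):

    #include <cassert>

    typedef double fptype;
    constexpr int neppV = 4;
    struct fptype_sv { fptype v[neppV]; }; // toy stand-in for the SIMD vector type

    // Same shape as the new kernelAccessP: view a flat buffer as SIMD vectors
    inline fptype_sv* kernelAccessP( fptype* buffer )
    {
      return reinterpret_cast<fptype_sv*>( buffer );
    }

    int main()
    {
      fptype buffer[2 * neppV] = {}; // storage for two SIMD vectors
      fptype_sv* sv = kernelAccessP( buffer );
      sv[1].v[2] = 7.;                   // write through the vector view...
      assert( buffer[neppV + 2] == 7. ); // ...lands at the expected flat offset
      return 0;
    }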
quit
-real 0m1.007s
-user 0m0.940s
-sys 0m0.062s
-Code generation completed in 1 seconds
+real 0m2.530s
+user 0m0.769s
+sys 0m0.160s
+Code generation completed in 2 seconds
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
 }
+ // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+ // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+ static __host__ __device__ inline fptype_sv*
+ kernelAccessP( fptype* buffer )
+ {
+ return reinterpret_cast<fptype_sv*>( buffer );
+ }
+
 // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
 // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
 static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h
index c5e79dc1b1..76849a871e 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_MSSM_SLHA2.h"
+#include "processConfig.h"
 #include
@@ -295,7 +296,8 @@ namespace mg5amcCpu
 typedef BufferBase<fptype> BufferNumerators;
 // The size (number of elements) per event in a memory buffer for numerators
- constexpr size_t sizePerEventNumerators = 1;
+ // (should be equal to the number of diagrams in the process)
+ constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 #ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc
index e4718e0681..6950507444 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+ ,
+ const int ievt00,
+ bool sanityCheckMixedPrecision = true
+#endif
+ )
+ {
+ unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+ using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+ // The cudacpp implementation ASSUMES (and checks!
#898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -1086,7 +1150,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1116,11 +1179,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1195,38 +1259,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1405,11 +1438,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 463187a10a..d22dd2464e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. 
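In the sigmaKin hunk above, the single-diagram enhancement becomes an explicit ratio: the denominator keeps the running sum of |amp|^2 over all diagrams, while only the sampled channel's slot enters the numerator. In scalar form the reweighting is just the fraction below (toy numbers, no SIMD, no multichannel bookkeeping):

    #include <cassert>

    int main()
    {
      // Hypothetical per-diagram |amp|^2 accumulated during calculate_jamps
      const double numerators[3] = { 0.2, 0.5, 0.3 };
      double denominator = 0.;
      for( int i = 0; i < 3; ++i ) denominator += numerators[i]; // sum over diagrams

      const unsigned int channelId = 2; // 1-based index of the sampled diagram
      double me = 4.2;                  // hypothetical matrix element before enhancement
      me *= numerators[channelId - 1] / denominator; // as in MEs_sv *= numerators_sv[channelId - 1] / denominators_sv

      assert( me > 0. && me < 4.2 ); // the ratio is a fraction of the full sum
      return 0;
    }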
This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -549,7 +550,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.089 s +1 processes with 3 diagrams generated in 0.093 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -560,10 +561,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --ve INFO: initialize a new directory: CODEGEN_mad_susy_gg_tt INFO: remove old information in CODEGEN_mad_susy_gg_tt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -575,49 +576,51 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s -Wrote files for 10 helas calls in 0.076 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s +Wrote files for 10 helas calls in 1.316 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 
ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.123 s +ALOHA: aloha creates 2 routines in 0.076 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.120 s +ALOHA: aloha creates 4 routines in 0.062 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README Run "open index.html" to see more information about this process. 
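The susy_gg_tt.mad hunks further below size the numerator buffers with processConfig::ndiagrams; for this process the generated constant would be 3, matching "Process has 3 diagrams" in the log above. An illustrative, hand-written stand-in for the generated header and its use in buffer sizing (per-process values; this is not the generated file itself):

    #include <cstddef>

    // Illustrative stand-in for the code-generated processConfig.h of g g > t t~
    namespace processConfig
    {
      constexpr int ndiagrams = 3; // one numerator slot per Feynman diagram
    }

    int main()
    {
      // MemoryBuffers.h now sizes the numerators buffer per event by ndiagrams
      constexpr std::size_t sizePerEventNumerators = processConfig::ndiagrams;
      constexpr std::size_t nevt = 16;
      static_assert( nevt * sizePerEventNumerators == 48, "16 events x 3 diagrams" );
      return 0;
    }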
quit -real 0m3.218s -user 0m2.778s -sys 0m0.430s -Code generation completed in 3 seconds +real 0m8.501s +user 0m1.813s +sys 0m0.727s +Code generation completed in 8 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -638,9 +641,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -667,9 +670,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/ident_card.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/ident_card.dat index 9cfb7ac1a2..0c5a1bdd83 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/ident_card.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/ident_card.dat @@ -232,9 +232,9 @@ mse2 3 3 mdl_RmE23x3 msl2 1 1 mdl_RmL21x1 msl2 3 3 mdl_RmL23x3 msoft 1 mdl_RMx1 +msoft 2 mdl_RMx2 msoft 21 mdl_mHd2 msoft 22 mdl_mHu2 -msoft 2 mdl_RMx2 msoft 3 mdl_RMx3 msq2 1 1 mdl_RmQ21x1 msq2 3 3 mdl_RmQ23x3 diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts b/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/param_card.inc b/epochX/cudacpp/susy_gg_tt.mad/Source/param_card.inc index 6acb037f00..a3d72e8ed8 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/param_card.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/param_card.inc @@ -81,9 +81,9 @@ MDL_RML21X1 = 3.815567D+04 MDL_RML23X3 = 3.782868D+04 MDL_RMX1 = 1.013965D+02 + MDL_RMX2 = 1.915042D+02 MDL_MHD2 = 3.233749D+04 MDL_MHU2 = -1.288001D+05 - MDL_RMX2 = 1.915042D+02 MDL_RMX3 = 5.882630D+02 MDL_RMQ21X1 = 2.998367D+05 MDL_RMQ23X3 = 2.487654D+05 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h index c5e79dc1b1..76849a871e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include 
"GpuRuntime.h" #include "Parameters_MSSM_SLHA2.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 98722d3089..be603f5cda 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], -1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -405,8 +472,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], -1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -418,8 +488,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], -1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -1062,7 +1135,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1092,11 +1164,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; 
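// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] In scalar (non-SIMD) terms, the
// per-diagram numerator layout introduced above reduces to the following
// hypothetical helper; all names here are illustrative only.
#include <vector>
constexpr int ndiagramsSketch = 3; // e.g. g g > t t~ has 3 diagrams
// same convention as 'allNumerators[ievt * processConfig::ndiagrams + idiag]'
inline double& numeratorSketch( std::vector<double>& buf, int ievt, int idiag )
{
  return buf[ievt * ndiagramsSketch + idiag];
}
// The real code strides over fptype_sv SIMD pages rather than scalar events,
// but the per-diagram stride inside one event record is identical.
// ---------------------------------------------------------------------------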
#endif } @@ -1171,38 +1244,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1381,11 +1423,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 9c4080b86d..6416d0cc59 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,15 +49,12 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F import model MSSM_SLHA2 -INFO: load particles -INFO: load vertices -DEBUG: model prefixing takes 0.6192381381988525  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -552,13 +550,13 @@ INFO: Please specify coupling orders to bypass this step. 
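[Editor's aside on the CPPProcess.cc changes above, a sketch of my reading rather than anything stated in the patch: storing one numerator per diagram turns the single-diagram enhancement into a simple lookup. After the sum over helicities, the matrix element of an event assigned to channel c is reweighted as

  ME_out = ME \cdot \frac{N_c}{D}, \qquad N_c = \sum_{hel} |A_c|^2, \qquad D = \sum_{hel} \sum_{i=1}^{n_{diag}} |A_i|^2,

which is exactly 'MEs_sv *= numerators_sv[channelId - 1] / denominators_sv', with D accumulated over all diagrams in denominators_sv. Previously only the single channel requested upfront had its numerator available; now N_c can be read back for any channel c.]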
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.063 s +1 processes with 3 diagrams generated in 0.070 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -567,30 +565,30 @@ INFO: Processing color information for process: g g > t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. 
-Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. +Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.095 s +ALOHA: aloha creates 2 routines in 0.070 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
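[Editor's note: the susy_gg_tt.sa diffs below repeat the getChannelId helper already shown for susy_gg_tt.mad above. Its core is the #898 sanity check that all lanes of one SIMD event page carry the same channelId, so that a single scalar can drive the whole page. A minimal standalone sketch of that check, with hypothetical names:

  #include <cassert>
  inline unsigned int uniformChannelIdSketch( const unsigned int* page, int neppV )
  {
    const unsigned int channelId = page[0];
    for( int i = 1; i < neppV; ++i )
      assert( channelId == page[i] ); // all events in the page must agree (#898)
    assert( channelId > 0 );          // a zero id would mean multichannel is disabled
    return channelId;
  }

In mixed-precision SIMD builds the helper additionally checks a second neppV page against the same scalar (#924), as the code below shows.]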
quit -real 0m1.922s -user 0m1.810s -sys 0m0.099s +real 0m2.256s +user 0m0.807s +sys 0m0.146s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h index c5e79dc1b1..76849a871e 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_MSSM_SLHA2.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index b88ebd5b4a..4790c980b3 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -357,7 +421,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,7 +435,7 @@ namespace mg5amcCpu unsigned int channelId = gpu_channelId( allChannelIds ); #endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -1059,7 +1123,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1089,11 +1152,12 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + numerators_sv[i] = fptype_sv{ 0 }; denominators_sv = fptype_sv{ 0 }; #endif } @@ -1168,38 +1232,7 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } + unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1378,11 +1411,12 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) From b029a31da674a30c86b5ebe51a05ad286f71bf8c Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Thu, 20 Nov 2025 12:46:29 +0100 Subject: [PATCH 06/18] add UMAMI interface, first compiling version of diagram sampling for SIMD --- .../gpu/MatrixElementKernels.cc | 2 +- .../iolibs/template_files/gpu/cudacpp.mk | 4 +- .../gpu/process_function_definitions.inc | 5 + .../iolibs/template_files/gpu/process_h.inc | 3 + .../gpu/process_sigmaKin_function.inc | 152 ++++-- .../iolibs/template_files/gpu/umami.cc | 467 ++++++++++++++++++ .../iolibs/template_files/gpu/umami.h | 209 ++++++++ 
.../PLUGIN/CUDACPP_SA_OUTPUT/output.py | 2 + 8 files changed, 799 insertions(+), 45 deletions(-) create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index 5ede45b123..480e0048b0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), nullptr, m_numerators.data(), m_denominators.data(), true, m_selhel.data(), m_selcol.data(), nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 22acd3abe9..b13b6e9edc 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 525ab3a34c..9b993bb1d0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -728,6 +728,11 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrndchannel, // input: random numbers[nevt] for channel sampling + unsigned int* allChannelIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index 7de8886b1d..d21bdc898d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -110,6 +110,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrndchannel, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -117,6 +118,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allChannelIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index d47ee7da4d..25e4ebbe27 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -48,9 +48,23 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per 
GPU thread) ***
+
   // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
   // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
   // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
+/*#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  unsigned int channelId;
+  if (allrndchannel != nullptr) {
+    // temporarily set this to one if a random number for diagram sampling is
+    // provided so that calculate_wavefunctions fills the allNumerators and
+    // allDenominators arrays
+    channelId = 1;
+  } else {
+    channelId = getChannelId( allChannelIds );
+  }
+
+}
+#endif*/
   for( int ighel = 0; ighel < cNGoodHel; ighel++ )
   {
     const int ihel = cGoodHel[ighel];
@@ -70,6 +84,21 @@
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Event-by-event random choice of channel
+    if ( allrndchannel != nullptr ) {
+      fptype numerator_sum = 0.;
+      for( int ichan = 0; ichan < ndiagrams; ichan++ )
+      {
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrndchannel[ievt] < numerator_sum / allDenominators[ievt] )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allChannelIdsOut[ievt] = channelId;
+    }
+
     // Event-by-event random choice of color #402
     gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
 #endif
@@ -111,7 +140,15 @@
     const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    unsigned int channelId = getChannelId( allChannelIds, ievt00 );
+    unsigned int channelId;
+    if (allrndchannel != nullptr) {
+      // temporarily set this to one if a random number for diagram sampling is
+      // provided so that calculate_wavefunctions fills the allNumerators and
+      // allDenominators arrays
+      channelId = 1;
+    } else {
+      channelId = getChannelId( allChannelIds, ievt00 );
+    }
 #endif
     // Running sum of partial amplitudes squared for event by event color selection (#402)
     // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
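// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] The event-by-event channel choice
// added in this file is an inverse-CDF draw over the per-diagram weights
// N_i / D. A scalar standalone version, with hypothetical names:
inline unsigned int sampleChannelSketch( const double* numerators, // [ndiagrams] per-diagram |amp|^2 sums
                                         double denominator,       // sum over all diagrams
                                         double rnd,               // uniform random number in [0,1)
                                         int ndiagrams )
{
  double cdf = 0.;
  for( int i = 0; i < ndiagrams; ++i )
  {
    cdf += numerators[i] / denominator; // cumulative probability of diagrams 0..i
    if( rnd < cdf ) return i + 1;       // channelId is 1-based
  }
  return ndiagrams; // guard: rounding can leave the cdf marginally below one
}
// ---------------------------------------------------------------------------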
This should be probably set up in a cleaner way where channelId is + // a vector everywhere + unsigned int channelIdVec[neppV]; + + // Event-by-event random choice of channel + if ( allrndchannel != nullptr ) { + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + fptype numerator_sum = 0.; + for( int ichan = 0; ichan < processConfig::ndiagrams; ichan++ ) + { + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrndchannel[ievt] < numerator_sum / allDenominators[ievt] ) + { + channelIdVec[ieppV] = ichan + 1; + break; + } + } + allChannelIdsOut[ievt] = channelIdVec[ieppV]; + } + } else { + for( int ieppV = 0; ieppV < neppV; ++ieppV ) channelIdVec[ieppV] = channelId; + } + // Event-by-event random choice of color #402 if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) + for( int ieppV = 0; ieppV < neppV; ++ieppV ) { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } + channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype_sv targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = fptype_sv{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } + fptype_sv targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype_sv{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + } #endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { const int ievt = ievt00 + ieppV; //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) @@ -277,6 +341,10 @@ #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); + /*if( mulChannelWeight && allChannelIds != nullptr ) { + allMEs[ievt] *= allNumerators[ievt * processConfig::ndiagrams + channelId - 1] + / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + }*/ #else gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif @@ -288,7 +356,7 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc new file mode 100644 index 0000000000..6d82a9ede0 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc @@ -0,0 +1,467 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" +#include "GpuRuntime.h" + +#include + +#ifdef 
MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace { + +void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, + fptype* numerators, + fptype* denominators, + std::size_t count +) { + bool is_good_hel[CPPProcess::ncomb]; +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = count / n_threads; + bool *is_good_hel_device; + checkGpu(gpuMalloc(&is_good_hel_device, CPPProcess::ncomb)); + sigmaKin_getGoodHel<<>>( + momenta, couplings, matrix_elements, numerators, denominators, is_good_hel_device + ); + checkGpu(gpuPeekAtLastError()); + checkGpu(gpuMemcpy( + is_good_hel, is_good_hel_device, sizeof(is_good_hel), gpuMemcpyDefault + )); +#else // MGONGPUCPP_GPUIMPL + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, is_good_hel, count + ); +#endif // MGONGPUCPP_GPUIMPL + sigmaKin_setGoodHel(is_good_hel); + return nullptr; +} + +void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, + fptype* numerators, + fptype* denominators, + std::size_t count +) { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( + momenta, couplings, matrix_elements, numerators, denominators, count + ); +} + +#ifdef MGONGPUCPP_GPUIMPL +__device__ +#endif +void transpose_momenta( + const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride +) { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for (std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part) { + for(std::size_t i_mom = 0; i_mom < 4; ++i_mom) { + momenta_out[ + i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector + ] = momenta_in[ + stride * (CPPProcess::npar * i_mom + i_part) + i_event + ]; + } + } +} + +#ifdef MGONGPUCPP_GPUIMPL + +__global__ void copy_inputs( + const double* momenta_in, + const double* diagram_random_in, + const double* helicity_random_in, + const double* color_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* diagram_random, + fptype* helicity_random, + fptype* color_random, + fptype* g_s, + unsigned int* channel_index, + std::size_t count, + std::size_t stride +) { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + channel_index[i_event] = 2; + if (i_event >= count) return; + + transpose_momenta(momenta_in, momenta, i_event, stride); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt(4 * M_PI * alpha_s_in[i_event]) : 1.2177157847767195; +} + +__global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride +) { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if (i_event >= count) return; + + if (m2_out) m2_out[i_event] = matrix_elements[i_event]; + if (amp2_out) { + double denominator = denominators[i_event]; + for (std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag) { + amp2_out[stride * i_diag + i_event] = numerators[ + i_event * CPPProcess::ndiagrams + i_diag + ] / denominator; + } + } + if (diagram_out) diagram_out[i_event] = 0; + if (color_out) color_out[i_event] = color_index[i_event] - 1; + if (helicity_out) helicity_out[i_event] = helicity_index[i_event] - 1; +} + +#endif // MGONGPUCPP_GPUIMPL + +} + +extern "C" { + +UmamiStatus umami_get_meta(UmamiMetaKey meta_key, void* result) { + switch (meta_key) { + case UMAMI_META_DEVICE: { + UmamiDevice& device = *static_cast(result); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined(__HIPCC__) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } case UMAMI_META_PARTICLE_COUNT: + *static_cast(result) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast(result) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast(result) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; +} + + +UmamiStatus umami_initialize(UmamiHandle* handle, const char* param_card_path) { + CPPProcess process; + process.initProc(param_card_path); + // We don't actually need the CPPProcess instance for anything as it initializes a + // global variable. 
+
+
+UmamiStatus umami_initialize(UmamiHandle* handle, const char* param_card_path) {
+    CPPProcess process;
+    process.initProc(param_card_path);
+    // We don't actually need the CPPProcess instance itself; initProc fills a
+    // global variable. So the handle just stores a boolean that records whether
+    // the good helicities have been initialized
+    *handle = new bool(false);
+    return UMAMI_SUCCESS;
+}
+
+
+UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    const char* name,
+    double parameter_real,
+    double parameter_imag
+) {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+}
+
+
+UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    const char* name,
+    double* parameter_real,
+    double* parameter_imag
+) {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+}
+
+
+UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    const UmamiInputKey* input_keys,
+    const void** inputs,
+    size_t output_count,
+    const UmamiOutputKey* output_keys,
+    void** outputs
+) {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr;
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+
+    for (std::size_t i = 0; i < input_count; ++i) {
+        const void* input = inputs[i];
+        switch (input_keys[i]) {
+            case UMAMI_IN_MOMENTA:
+                momenta_in = static_cast<const double*>(input);
+                break;
+            case UMAMI_IN_ALPHA_S:
+                alpha_s_in = static_cast<const double*>(input);
+                break;
+            case UMAMI_IN_FLAVOR_INDEX:
+                flavor_in = static_cast<const int*>(input);
+                break;
+            case UMAMI_IN_RANDOM_COLOR:
+                random_color_in = static_cast<const double*>(input);
+                break;
+            case UMAMI_IN_RANDOM_HELICITY:
+                random_helicity_in = static_cast<const double*>(input);
+                break;
+            case UMAMI_IN_RANDOM_DIAGRAM:
+                random_diagram_in = static_cast<const double*>(input);
+                break;
+            case UMAMI_IN_HELICITY_INDEX:
+                return UMAMI_ERROR_UNSUPPORTED_INPUT;
+            case UMAMI_IN_DIAGRAM_INDEX:
+                diagram_in = static_cast<const int*>(input);
+                break;
+            case UMAMI_IN_GPU_STREAM:
+#ifdef MGONGPUCPP_GPUIMPL
+                gpu_stream = static_cast<gpuStream_t>(const_cast<void*>(input));
+                break;
+#else
+                return UMAMI_ERROR_UNSUPPORTED_INPUT;
+#endif
+            default:
+                return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        }
+    }
+    if (!momenta_in) return UMAMI_ERROR_MISSING_INPUT;
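For illustration, a hypothetical caller assembling the key/pointer arrays that the parsing loop above consumes. Only UMAMI_IN_MOMENTA is mandatory; everything else falls back to the defaults applied in copy_inputs (0.5 for the random numbers, a fixed g_s). Names and the stride/offset choices are assumptions, not part of the patch:

#include "umami.h"

UmamiStatus evaluate(UmamiHandle handle, const double* momenta,
                     const double* alpha_s, double* m2, size_t count) {
    const UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
    const void* ins[] = { momenta, alpha_s };
    const UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
    void* outs[] = { m2 };
    // A single contiguous batch: stride equals the event count, offset is zero.
    return umami_matrix_element(handle, count, /*stride=*/count, /*offset=*/0,
                                2, in_keys, ins, 1, out_keys, outs);
}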
+
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for (std::size_t i = 0; i < output_count; ++i) {
+        void* output = outputs[i];
+        switch (output_keys[i]) {
+            case UMAMI_OUT_MATRIX_ELEMENT:
+                m2_out = static_cast<double*>(output);
+                break;
+            case UMAMI_OUT_DIAGRAM_AMP2:
+                amp2_out = static_cast<double*>(output);
+                break;
+            case UMAMI_OUT_COLOR_INDEX:
+                color_out = static_cast<int*>(output);
+                break;
+            case UMAMI_OUT_HELICITY_INDEX:
+                helicity_out = static_cast<int*>(output);
+                break;
+            case UMAMI_OUT_DIAGRAM_INDEX:
+                diagram_out = static_cast<int*>(output);
+                break;
+            default:
+                return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+        }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = (count + n_threads - 1) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random;
+    fptype *matrix_elements, *numerators, *denominators;
+    int *helicity_index, *color_index;
+    unsigned int *channel_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync(&momenta, rounded_count * CPPProcess::npar * 4 * sizeof(fptype), gpu_stream);
+    gpuMallocAsync(&couplings, rounded_count * n_coup * 2 * sizeof(fptype), gpu_stream);
+    gpuMallocAsync(&g_s, rounded_count * sizeof(fptype), gpu_stream);
+    gpuMallocAsync(&helicity_random, rounded_count * sizeof(fptype), gpu_stream);
+    gpuMallocAsync(&color_random, rounded_count * sizeof(fptype), gpu_stream);
+    gpuMallocAsync(&matrix_elements, rounded_count * sizeof(fptype), gpu_stream);
+    gpuMallocAsync(&channel_index, rounded_count * sizeof(unsigned int), gpu_stream);
+    gpuMallocAsync(&numerators, rounded_count * CPPProcess::ndiagrams * sizeof(fptype), gpu_stream);
+    gpuMallocAsync(&denominators, rounded_count * sizeof(fptype), gpu_stream);
+    gpuMallocAsync(&helicity_index, rounded_count * sizeof(int), gpu_stream);
+    gpuMallocAsync(&color_index, rounded_count * sizeof(int), gpu_stream);
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+        momenta_in, random_in, alpha_s_in,
+        momenta, helicity_random, color_random, g_s, channel_index,
+        count, stride
+    );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>(g_s, couplings);
+    checkGpu(gpuPeekAtLastError());
+
+    bool& is_initialized = *static_cast<bool*>(handle);
+    if (!is_initialized) {
+        gpuStreamSynchronize(gpu_stream);
+        initialize(
+            momenta, couplings, matrix_elements, numerators, denominators, rounded_count
+        );
+        is_initialized = true;
+    }
+
+    sigmaKin<<<n_blocks, n_threads, 0, gpu_stream>>>(
+        momenta,
+        couplings,
+        helicity_random,
+        color_random,
+        matrix_elements,
+        nullptr,
+        channel_index,
+        numerators,
+        denominators,
+        false,
+        helicity_index,
+        color_index
+    );
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+        denominators, numerators, matrix_elements, color_index, helicity_index,
+        m2_out, amp2_out, diagram_out, color_out, helicity_out,
+        count, stride
+    );
+    checkGpu(gpuPeekAtLastError());
+
+    gpuFreeAsync(momenta, gpu_stream);
+    gpuFreeAsync(couplings, gpu_stream);
+    gpuFreeAsync(g_s, gpu_stream);
+    gpuFreeAsync(helicity_random, gpu_stream);
+    gpuFreeAsync(color_random, gpu_stream);
+    gpuFreeAsync(matrix_elements, gpu_stream);
+    gpuFreeAsync(channel_index, gpu_stream);
+    gpuFreeAsync(numerators, gpu_stream);
+    gpuFreeAsync(denominators, gpu_stream);
+    gpuFreeAsync(helicity_index, gpu_stream);
+    gpuFreeAsync(color_index, gpu_stream);
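The block above relies on stream-ordered memory: every buffer is allocated, used and freed on the same stream, so no device-wide synchronization is needed between batches. Assuming the gpuMallocAsync/gpuFreeAsync wrappers map to cudaMallocAsync/cudaFreeAsync (the macros added to GpuAbstraction.h later in this series), the underlying CUDA idiom is, as a sketch:

#include <cuda_runtime.h>
#include <cstddef>

void stream_scratch(cudaStream_t stream, std::size_t n) {
    float* buf = nullptr;
    // Allocation is ordered within the stream (CUDA 11.2+ memory pools).
    cudaMallocAsync(reinterpret_cast<void**>(&buf), n * sizeof(float), stream);
    // ... enqueue kernels reading/writing buf on the same stream ...
    // The free is also stream-ordered, so later allocations can reuse the block.
    cudaFreeAsync(buf, stream);
}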
+
+#else // MGONGPUCPP_GPUIMPL
+    // need to round up to a double page size for some reason
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = (count + page_size2 - 1) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta(rounded_count * CPPProcess::npar * 4);
+    HostBufferBase<fptype> couplings(
+        rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2
+    );
+    HostBufferBase<fptype> g_s(rounded_count);
+    HostBufferBase<fptype> helicity_random(rounded_count);
+    HostBufferBase<fptype> color_random(rounded_count);
+    HostBufferBase<fptype> diagram_random(rounded_count);
+    HostBufferBase<fptype> matrix_elements(rounded_count);
+    HostBufferBase<unsigned int> channel_index(rounded_count);
+    HostBufferBase<fptype> numerators(rounded_count * CPPProcess::ndiagrams);
+    HostBufferBase<fptype> denominators(rounded_count);
+    HostBufferBase<int> helicity_index(rounded_count);
+    HostBufferBase<int> color_index(rounded_count);
+
+    for (std::size_t i_event = 0; i_event < rounded_count; ++i_event) {
+        channel_index[i_event] = 2;
+    }
+    for (std::size_t i_event = 0; i_event < count; ++i_event) {
+        transpose_momenta(momenta_in, momenta.data(), i_event, stride);
+        helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event] : 0.5;
+        color_random[i_event] = random_color_in ? random_color_in[i_event] : 0.5;
+        diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event] : 0.5;
+        g_s[i_event] = alpha_s_in ? sqrt(4 * M_PI * alpha_s_in[i_event]) : 1.2177157847767195;
+    }
+    computeDependentCouplings(
+        g_s.data(), couplings.data(), rounded_count
+    );
+
+    bool& is_initialized = *static_cast<bool*>(handle);
+    if (!is_initialized) {
+        initialize(
+            momenta.data(),
+            couplings.data(),
+            matrix_elements.data(),
+            numerators.data(),
+            denominators.data(),
+            rounded_count
+        );
+        is_initialized = true;
+    }
+
+    sigmaKin(
+        momenta.data(),
+        couplings.data(),
+        helicity_random.data(),
+        color_random.data(),
+        matrix_elements.data(),
+        nullptr,
+        diagram_random.data(),
+        channel_index.data(),
+        numerators.data(),
+        denominators.data(),
+        false,
+        helicity_index.data(),
+        color_index.data(),
+        rounded_count
+    );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for (std::size_t i_event = 0; i_event < count; ++i_event) {
+        std::size_t i_page = i_event / page_size;
+        std::size_t i_vector = i_event % page_size;
+
+        double denominator = denominators[i_event];
+        m2_out[i_event] = matrix_elements[i_event];
+        for (std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag) {
+            amp2_out[stride * i_diag + i_event] = numerators[
+                i_page * page_size * CPPProcess::ndiagrams
+                + i_diag * page_size + i_vector
+            ] / denominator;
+        }
+        diagram_out[i_event] = 0;
+        color_out[i_event] = color_index[i_event] - 1;
+        helicity_out[i_event] = helicity_index[i_event] - 1;
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+}
+
+
+UmamiStatus umami_free(UmamiHandle handle) {
+    delete static_cast<bool*>(handle);
+    return UMAMI_SUCCESS;
+}
+
+}
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h
new file mode 100644
index 0000000000..6d1780ad22
--- /dev/null
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h
@@ -0,0 +1,209 @@
+/*
+ *                                   _
+ *                                  (_)
+ *   _   _ _ __ ___   __ _ _ __ ___ _
+ *  | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ *  | |_| | | | | | | (_| | | | | | | |
+ *   \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ *  Unified MAtrix eleMent Interface
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Major version number of the UMAMI interface. If the major version is the same
+ * between caller and implementation, binary compatibility is ensured.
+ */
+const inline int UMAMI_MAJOR_VERSION = 1;
+/**
+ * Minor version number of the UMAMI interface. Between minor versions, new keys for
+ * errors, devices, metadata, inputs and outputs can be added.
+ */
+const inline int UMAMI_MINOR_VERSION = 0;
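As a sketch of how the compatibility contract documented above could be enforced by a caller (hypothetical, not part of the patch): a program that loads a UMAMI implementation at runtime can compare the major version it was compiled against with the one reported by the library.

#include "umami.h"
#include <cassert>

// implementation_major would be obtained from the loaded library, e.g. via a
// version query exported alongside the interface (an assumption made here).
static void check_umami_abi(int implementation_major) {
    // Same major version implies binary compatibility per the doc comment above.
    assert(implementation_major == UMAMI_MAJOR_VERSION);
}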
+
+typedef enum {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+} UmamiStatus;
+
+typedef enum {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+} UmamiDevice;
+
+typedef enum {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+} UmamiMetaKey;
+
+typedef enum {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+    UMAMI_IN_GPU_STREAM,
+} UmamiInputKey;
+
+typedef enum {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+} UmamiOutputKey;
+
+typedef void* UmamiHandle;
+
+
+/**
+ * Retrieves a metadata value describing the process and the build, e.g. the device
+ * flavor or the number of external particles, diagrams and helicity combinations.
+ *
+ * @param meta_key
+ *     key identifying the requested metadata entry
+ * @param result
+ *     pointer to the location the value is written to; its type depends on the key
+ *     (UmamiDevice for UMAMI_META_DEVICE, int for the count keys)
+ * @return
+ *     UMAMI_SUCCESS on success, error code otherwise
+ */
+UmamiStatus umami_get_meta(UmamiMetaKey meta_key, void* result);
+
+/**
+ * Creates an instance of the matrix element. Each instance is independent, so thread
+ * safety can be achieved by creating a separate one for every thread.
+ *
+ * @param handle
+ *     pointer to an instance of the subprocess. Has to be cleaned up by
+ *     the caller with `umami_free`.
+ * @param param_card_path
+ *     path to the parameter file
+ * @return
+ *     UMAMI_SUCCESS on success, error code otherwise
+ */
+UmamiStatus umami_initialize(UmamiHandle* handle, const char* param_card_path);
+
+/**
+ * Sets the value of a model parameter
+ *
+ * @param handle
+ *     handle of a matrix element instance
+ * @param name
+ *     name of the parameter
+ * @param parameter_real
+ *     real part of the parameter value
+ * @param parameter_imag
+ *     imaginary part of the parameter value. Ignored for real-valued parameters.
+ * @return
+ *     UMAMI_SUCCESS on success, error code otherwise
+ */
+UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    const char* name,
+    double parameter_real,
+    double parameter_imag
+);
+
+/**
+ * Retrieves the value of a model parameter
+ *
+ * @param handle
+ *     handle of a matrix element instance
+ * @param name
+ *     name of the parameter
+ * @param parameter_real
+ *     pointer to double to return the real part of the parameter value
+ * @param parameter_imag
+ *     pointer to double to return the imaginary part of the parameter value. Ignored
+ *     for real-valued parameters (i.e. you may pass a null pointer)
+ * @return
+ *     UMAMI_SUCCESS on success, error code otherwise
+ */
+UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    const char* name,
+    double* parameter_real,
+    double* parameter_imag
+);
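A hypothetical lifecycle sketch for the functions declared above: one instance per thread, initialized from a param card and released with umami_free. The wrapper name and return convention are assumptions for illustration:

#include "umami.h"

int run(const char* card_path) {
    UmamiHandle h = nullptr;
    if (umami_initialize(&h, card_path) != UMAMI_SUCCESS) return 1;
    // ... repeated calls to umami_matrix_element(h, ...) on this thread ...
    return umami_free(h) == UMAMI_SUCCESS ? 0 : 1;
}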
+
+/**
+ * Evaluates the matrix element as a function of the given inputs, filling the
+ * requested outputs.
+ *
+ * @param handle
+ *     handle of a matrix element instance
+ * @param count
+ *     number of events to evaluate the matrix element for
+ * @param stride
+ *     stride of the batch dimension of the input and output arrays, see memory layout
+ * @param offset
+ *     offset of the event index
+ * @param input_count
+ *     number of inputs to the matrix element
+ * @param input_keys
+ *     pointer to an array of input keys, length `input_count`
+ * @param inputs
+ *     pointer to an array of void pointers to the inputs. The type of the inputs
+ *     depends on the input key
+ * @param output_count
+ *     number of outputs of the matrix element
+ * @param output_keys
+ *     pointer to an array of output keys, length `output_count`
+ * @param outputs
+ *     pointer to an array of void pointers to the outputs. The type of the outputs
+ *     depends on the output key. The caller is responsible for allocating memory for
+ *     the outputs.
+ * @return
+ *     UMAMI_SUCCESS on success, error code otherwise
+ */
+UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    const UmamiInputKey* input_keys,
+    const void** inputs,
+    size_t output_count,
+    const UmamiOutputKey* output_keys,
+    void** outputs
+);
+
+/**
+ * Frees a matrix element instance
+ *
+ * @param handle
+ *     handle of a matrix element instance
+ * @return
+ *     UMAMI_SUCCESS on success, error code otherwise
+ */
+UmamiStatus umami_free(UmamiHandle handle);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
index f2d7189ddd..6669d53123 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
@@ -123,6 +123,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU):
                               s+'gpu/testmisc.cc', s+'gpu/testxxx_cc_ref.txt',
                               s+'gpu/valgrind.h', s+'gpu/perf.py', s+'gpu/profile.sh',
                               s+'gpu/cudacpp_overlay.mk', s+'gpu/makefile_wrapper.mk',
+                              s+'gpu/umami.h', s+'gpu/umami.cc',
                               s+'CMake/SubProcesses/CMakeLists.txt'],
                     'test': [s+'gpu/cudacpp_test.mk']}
@@ -151,6 +152,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU):
                     'testxxx.cc', # this is generated from a template in Subprocesses but we still link it in P1
                     'MemoryBuffers.h', # this is generated from a template in Subprocesses but we still link it in P1
                     'MemoryAccessCouplings.h', # this is generated from a template in Subprocesses but we still link it in P1
+                    'umami.h', 'umami.cc',
                     'perf.py', 'profile.sh']
 # AV - use template files from PLUGINDIR instead of MG5DIR and change their names

From e21687d8dea5b57433d4f6165ffdb5ae83f32e03 Mon Sep 17 00:00:00 2001
From: Theo Heimel
Date: Sun, 23 Nov 2025 20:45:18 +0100
Subject: [PATCH 07/18] debug new interface on CPU

---
 .../gpu/process_function_definitions.inc      |   3 +-
 .../gpu/process_sigmaKin_function.inc         |  39 +++--
 .../iolibs/template_files/gpu/umami.cc        | 147 +++++++++++-------
 .../iolibs/template_files/gpu/umami.h         |  14 +-
 4 files changed, 130 insertions(+), 73 deletions(-)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc
index 9b993bb1d0..093a993cc7 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc
+++ 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -728,7 +728,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - const fptype* allrndchannel, // input: random numbers[nevt] for channel sampling + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling unsigned int* allChannelIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities @@ -742,6 +742,7 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* ghelAllDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 25e4ebbe27..4c9f400cd3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -54,7 +54,7 @@ // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s /*#ifdef MGONGPU_SUPPORTS_MULTICHANNEL unsigned int channelId; - if (allrndchannel != nullptr) { + if (allrnddiagram != nullptr) { // temporarily set this to one if a random number for diagram sampling is // provided so that calculate_wavefunctions fills the allNumerators and // allDenominators arrays @@ -85,18 +85,18 @@ gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of channel - if ( allrndchannel != nullptr ) { + if ( allrnddiagram != nullptr ) { fptype numerator_sum = 0.; for( int ichan = 0; ichan < ndiagrams; ichan++ ) { numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; - if( allrndchannel[ievt] < numerator_sum / allDenominators[ievt] ) + if( allrnddiagram[ievt] < numerator_sum / allDenominators[ievt] ) { channelId = ichan + 1; break; } } - allChannelIdsOut[ievt] = channelId; + allDiagramIdsOut[ievt] = channelId; } // Event-by-event random choice of color #402 @@ -141,7 +141,7 @@ #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL unsigned int channelId; - if (allrndchannel != nullptr) { + if (allrnddiagram != nullptr) { // temporarily set this to one if a random number for diagram sampling is // provided 
so that calculate_wavefunctions fills the allNumerators and // allDenominators arrays @@ -215,27 +215,40 @@ // this is a workaround to be able to have different channel ids withing the same // SIMD vector. This should be probably set up in a cleaner way where channelId is // a vector everywhere - unsigned int channelIdVec[neppV]; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; // Event-by-event random choice of channel - if ( allrndchannel != nullptr ) { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + if ( allrnddiagram != nullptr ) { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; + fptype numerator_sum_all = 0.; + for( int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + numerator_sum_all += allNumerators[ + ievt * processConfig::ndiagrams + mgOnGpu::channel2iconfig[ichan]]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; fptype numerator_sum = 0.; - for( int ichan = 0; ichan < processConfig::ndiagrams; ichan++ ) + for( int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { - numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; - if( allrndchannel[ievt] < numerator_sum / allDenominators[ievt] ) + numerator_sum += allNumerators[ + ievt * processConfig::ndiagrams + mgOnGpu::channel2iconfig[ichan]]; + if( allrnddiagram[ievt] < numerator_sum / numerator_sum_all ) { channelIdVec[ieppV] = ichan + 1; break; } } - allChannelIdsOut[ievt] = channelIdVec[ieppV]; + allDiagramIdsOut[ievt] = mgOnGpu::channel2iconfig[channelIdVec[ieppV] - 1]; } } else { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) channelIdVec[ieppV] = channelId; + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) channelIdVec[ieppV] = channelId; } // Event-by-event random choice of color #402 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc index 6d82a9ede0..5e1f1e535b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc @@ -85,28 +85,33 @@ void transpose_momenta( __global__ void copy_inputs( const double* momenta_in, - const double* diagram_random_in, const double* helicity_random_in, const double* color_random_in, + const double* diagram_random_in, const double* alpha_s_in, fptype* momenta, - fptype* diagram_random, fptype* helicity_random, fptype* color_random, + fptype* diagram_random, fptype* g_s, - unsigned int* channel_index, + unsigned int* diagram_index, std::size_t count, - std::size_t stride + std::size_t stride, + std::size_t offset ) { std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; - channel_index[i_event] = 2; + diagram_index[i_event] = 2; if (i_event >= count) return; - transpose_momenta(momenta_in, momenta, i_event, stride); - diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event] : 0.5; - helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event] : 0.5; - color_random[i_event] = color_random_in ? color_random_in[i_event] : 0.5; - g_s[i_event] = alpha_s_in ? sqrt(4 * M_PI * alpha_s_in[i_event]) : 1.2177157847767195; + transpose_momenta(&momenta_in[offset], momenta, i_event, stride); + diagram_random[i_event] = diagram_random_in ? 
+ diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? + helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? + color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? + sqrt(4 * M_PI * alpha_s_in[i_event + offset]) : 1.2177157847767195; } __global__ void copy_outputs( @@ -121,23 +126,24 @@ __global__ void copy_outputs( int* color_out, int* helicity_out, std::size_t count, - std::size_t stride + std::size_t stride, + std::size_t offset ) { std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; if (i_event >= count) return; - if (m2_out) m2_out[i_event] = matrix_elements[i_event]; + if (m2_out) m2_out[i_event + offset] = matrix_elements[i_event]; if (amp2_out) { double denominator = denominators[i_event]; for (std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag) { - amp2_out[stride * i_diag + i_event] = numerators[ + amp2_out[stride * i_diag + i_event + offset] = numerators[ i_event * CPPProcess::ndiagrams + i_diag ] / denominator; } } - if (diagram_out) diagram_out[i_event] = 0; - if (color_out) color_out[i_event] = color_index[i_event] - 1; - if (helicity_out) helicity_out[i_event] = helicity_index[i_event] - 1; + if (diagram_out) diagram_out[i_event + offset] = 0; + if (color_out) color_out[i_event + offset] = color_index[i_event] - 1; + if (helicity_out) helicity_out[i_event + offset] = helicity_index[i_event] - 1; } #endif // MGONGPUCPP_GPUIMPL @@ -178,7 +184,7 @@ UmamiStatus umami_get_meta(UmamiMetaKey meta_key, void* result) { } -UmamiStatus umami_initialize(UmamiHandle* handle, const char* param_card_path) { +UmamiStatus umami_initialize(UmamiHandle* handle, char const* param_card_path) { CPPProcess process; process.initProc(param_card_path); // We don't actually need the CPPProcess instance for anything as it initializes a @@ -192,7 +198,7 @@ UmamiStatus umami_initialize(UmamiHandle* handle, const char* param_card_path) { UmamiStatus umami_set_parameter( UmamiHandle handle, - const char* name, + char const* name, double parameter_real, double parameter_imag ) { @@ -202,7 +208,7 @@ UmamiStatus umami_set_parameter( UmamiStatus umami_get_parameter( UmamiHandle handle, - const char* name, + char const* name, double* parameter_real, double* parameter_imag ) { @@ -216,11 +222,11 @@ UmamiStatus umami_matrix_element( size_t stride, size_t offset, size_t input_count, - const UmamiInputKey* input_keys, - const void** inputs, + UmamiInputKey const* input_keys, + void const* const* inputs, size_t output_count, - const UmamiOutputKey* output_keys, - void** outputs + UmamiOutputKey const* output_keys, + void* const* outputs ) { const double* momenta_in = nullptr; const double* alpha_s_in = nullptr; @@ -308,7 +314,7 @@ UmamiStatus umami_matrix_element( fptype *momenta, *couplings, *g_s, *helicity_random, *color_random; fptype *matrix_elements, *numerators, *denominators; int *helicity_index, *color_index; - unsigned int *channel_index; + unsigned int *diagram_index; std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; gpuMallocAsync(&momenta, rounded_count * CPPProcess::npar * 4 * sizeof(fptype), gpu_stream); @@ -316,17 +322,29 @@ UmamiStatus umami_matrix_element( gpuMallocAsync(&g_s, rounded_count * sizeof(fptype), gpu_stream); gpuMallocAsync(&helicity_random, rounded_count * sizeof(fptype), gpu_stream); gpuMallocAsync(&color_random, rounded_count * sizeof(fptype), gpu_stream); + gpuMallocAsync(&diagram_random, rounded_count * sizeof(fptype), gpu_stream); 
gpuMallocAsync(&matrix_elements, rounded_count * sizeof(fptype), gpu_stream); - gpuMallocAsync(&channel_index, rounded_count * sizeof(unsigned int), gpu_stream); + gpuMallocAsync(&diagram_index, rounded_count * sizeof(unsigned int), gpu_stream); gpuMallocAsync(&numerators, rounded_count * CPPProcess::ndiagrams * sizeof(fptype), gpu_stream); gpuMallocAsync(&denominators, rounded_count * sizeof(fptype), gpu_stream); gpuMallocAsync(&helicity_index, rounded_count * sizeof(int), gpu_stream); gpuMallocAsync(&color_index, rounded_count * sizeof(int), gpu_stream); copy_inputs<<>>( - momenta_in, random_in, alpha_s_in, - momenta, helicity_random, color_random, g_s, channel_index, - count, stride + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + diagram_index, + count, + stride, + offset ); computeDependentCouplings<<>>(g_s, couplings); checkGpu(gpuPeekAtLastError()); @@ -347,7 +365,8 @@ UmamiStatus umami_matrix_element( color_random, matrix_elements, nullptr, - channel_index, + diagram_random, + diagram_index, numerators, denominators, false, @@ -355,9 +374,19 @@ UmamiStatus umami_matrix_element( color_index ); copy_outputs<<>>( - denominators, numerators, matrix_elements, color_index, helicity_index, - m2_out, amp2_out, diagram_out, color_out, helicity_out, - count, stride + denominators, + numerators, + matrix_elements, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); checkGpu(gpuPeekAtLastError()); @@ -367,7 +396,7 @@ UmamiStatus umami_matrix_element( gpuFreeAsync(helicity_random, gpu_stream); gpuFreeAsync(color_random, gpu_stream); gpuFreeAsync(matrix_elements, gpu_stream); - gpuFreeAsync(channel_index, gpu_stream); + gpuFreeAsync(diagram_index, gpu_stream); gpuFreeAsync(numerators, gpu_stream); gpuFreeAsync(denominators, gpu_stream); gpuFreeAsync(helicity_index, gpu_stream); @@ -387,21 +416,25 @@ UmamiStatus umami_matrix_element( HostBufferBase color_random(rounded_count); HostBufferBase diagram_random(rounded_count); HostBufferBase matrix_elements(rounded_count); - HostBufferBase channel_index(rounded_count); + HostBufferBase diagram_index(rounded_count); HostBufferBase numerators(rounded_count * CPPProcess::ndiagrams); HostBufferBase denominators(rounded_count); HostBufferBase helicity_index(rounded_count); HostBufferBase color_index(rounded_count); - for (std::size_t i_event = 0; i_event < rounded_count; ++i_event) { - channel_index[i_event] = 2; - } + /*for (std::size_t i_event = 0; i_event < rounded_count; ++i_event) { + diagram_index[i_event] = 2; + }*/ for (std::size_t i_event = 0; i_event < count; ++i_event) { - transpose_momenta(momenta_in, momenta.data(), i_event, stride); - helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event] : 0.5; - color_random[i_event] = random_color_in ? random_color_in[i_event] : 0.5; - diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event] : 0.5; - g_s[i_event] = alpha_s_in ? sqrt(4 * M_PI * alpha_s_in[i_event]) : 1.2177157847767195; + transpose_momenta(&momenta_in[offset], momenta.data(), i_event, stride); + helicity_random[i_event] = random_helicity_in ? + random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? + random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? + random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
+ sqrt(4 * M_PI * alpha_s_in[i_event + offset]) : 1.2177157847767195; } computeDependentCouplings( g_s.data(), couplings.data(), rounded_count @@ -428,7 +461,7 @@ UmamiStatus umami_matrix_element( matrix_elements.data(), nullptr, diagram_random.data(), - channel_index.data(), + diagram_index.data(), numerators.data(), denominators.data(), false, @@ -443,16 +476,26 @@ UmamiStatus umami_matrix_element( std::size_t i_vector = i_event % page_size; double denominator = denominators[i_event]; - m2_out[i_event] = matrix_elements[i_event]; - for (std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag) { - amp2_out[stride * i_diag + i_event] = numerators[ - i_page * page_size * CPPProcess::ndiagrams + - i_diag * page_size + i_vector - ] / denominator; + if (m2_out != nullptr) { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if (amp2_out != nullptr) { + for (std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag) { + amp2_out[stride * i_diag + i_event + offset] = numerators[ + i_page * page_size * CPPProcess::ndiagrams + + i_diag * page_size + i_vector + ] / denominator; + } + } + if (diagram_out != nullptr) { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if (color_out != nullptr) { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if (helicity_out != nullptr) { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; } - diagram_out[i_event] = 0; - color_out[i_event] = color_index[i_event] - 1; - helicity_out[i_event] = helicity_index[i_event] - 1; } #endif // MGONGPUCPP_GPUIMPL return UMAMI_SUCCESS; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h index 6d1780ad22..eb801f96d3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h @@ -106,7 +106,7 @@ UmamiStatus umami_get_meta(UmamiMetaKey meta_key, void* result); * @return * UMAMI_SUCCESS on success, error code otherwise */ -UmamiStatus umami_initialize(UmamiHandle* handle, const char* param_card_path); +UmamiStatus umami_initialize(UmamiHandle* handle, char const* param_card_path); /** * Sets the value of a model parameter @@ -124,7 +124,7 @@ UmamiStatus umami_initialize(UmamiHandle* handle, const char* param_card_path); */ UmamiStatus umami_set_parameter( UmamiHandle handle, - const char* name, + char const* name, double parameter_real, double parameter_imag ); @@ -146,7 +146,7 @@ UmamiStatus umami_set_parameter( */ UmamiStatus umami_get_parameter( UmamiHandle handle, - const char* name, + char const* name, double* parameter_real, double* parameter_imag ); @@ -187,11 +187,11 @@ UmamiStatus umami_matrix_element( size_t stride, size_t offset, size_t input_count, - const UmamiInputKey* input_keys, - const void** inputs, + UmamiInputKey const* input_keys, + void const* const* inputs, size_t output_count, - const UmamiOutputKey* output_keys, - void** outputs + UmamiOutputKey const* output_keys, + void* const* outputs ); /** From e022a37225748ba4308353a3cb2a82f15767835b Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Mon, 24 Nov 2025 12:37:10 +0100 Subject: [PATCH 08/18] fix SIMD version --- .../iolibs/template_files/gpu/MatrixElementKernels.cc | 2 +- .../gpu/process_function_definitions.inc | 10 ++++------ .../madgraph/iolibs/template_files/gpu/process_h.inc | 4 ++-- 
.../template_files/gpu/process_sigmaKin_function.inc | 6 +++--- .../madgraph/iolibs/template_files/gpu/umami.cc | 8 ++++---- 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index 480e0048b0..cde304f988 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), nullptr, m_numerators.data(), m_denominators.data(), true, m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 093a993cc7..567f4f5093 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -729,10 +729,6 @@ namespace mg5amcCpu const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling - unsigned int* allChannelIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities - bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -754,8 +750,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, 
multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index d21bdc898d..b9931576d6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -110,7 +110,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - const fptype* allrndchannel, // input: random numbers[nevt] for channel sampling + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -118,7 +118,7 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities - unsigned int* allChannelIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 4c9f400cd3..401860ac71 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -87,7 +87,7 @@ // Event-by-event random choice of channel if ( allrnddiagram != nullptr ) { fptype numerator_sum = 0.; - for( int ichan = 0; ichan < ndiagrams; ichan++ ) + for( unsigned int ichan = 0; ichan < ndiagrams; ichan++ ) { numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; if( allrnddiagram[ievt] < numerator_sum / allDenominators[ievt] ) @@ -228,14 +228,14 @@ { const int ievt = ievt00 + ieppV; fptype numerator_sum_all = 0.; - for( int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { numerator_sum_all += allNumerators[ ievt * processConfig::ndiagrams + mgOnGpu::channel2iconfig[ichan]]; } channelIdVec[ieppV] = mgOnGpu::nchannels; fptype numerator_sum = 0.; - for( int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { numerator_sum += allNumerators[ ievt * processConfig::ndiagrams + mgOnGpu::channel2iconfig[ichan]]; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc index 5e1f1e535b..4485e4177c 
100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc @@ -458,15 +458,15 @@ UmamiStatus umami_matrix_element( couplings.data(), helicity_random.data(), color_random.data(), - matrix_elements.data(), nullptr, diagram_random.data(), - diagram_index.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), numerators.data(), denominators.data(), + diagram_index.data(), false, - helicity_index.data(), - color_index.data(), rounded_count ); From e0bb7b3a68df197551c518c0ab173552840f2f7c Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Mon, 24 Nov 2025 15:31:58 +0100 Subject: [PATCH 09/18] progress on GPU version --- .../gpu/MatrixElementKernels.cc | 8 +- .../gpu/process_function_definitions.inc | 13 ++- .../iolibs/template_files/gpu/process_h.inc | 3 + .../gpu/process_sigmaKin_function.inc | 8 +- .../iolibs/template_files/gpu/umami.cc | 85 +++++++++++++------ .../CUDACPP_SA_OUTPUT/model_handling.py | 3 +- 6 files changed, 80 insertions(+), 40 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index cde304f988..44eb05360f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) * CPPProcess::ndiagrams ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 567f4f5093..8eda543389 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -588,8 +588,9 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; @@ -601,12 +602,16 @@ namespace mg5amcCpu fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag) { + fptype* hAllNumerators = ghelAllNumerators + (idiag + ighel * nevt) * processConfig::ndiagrams; + totAllNumerators[ievt + idiag * processConfig::ndiagrams] += hAllNumerators[ievt]; + } + } + if (mulChannelWeight) { + allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index b9931576d6..90ffe70624 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -86,6 +86,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -94,6 +95,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 401860ac71..de3b6107ce 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -70,7 +70,7 @@ const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else @@ -353,11 +353,7 @@ // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); - /*if( mulChannelWeight && allChannelIds != nullptr ) { - allMEs[ievt] *= allNumerators[ievt * processConfig::ndiagrams + channelId - 1] - / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') - }*/ + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, mulChannelWeight, helcolDenominators[0] ); #else gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc index 4485e4177c..9812460c1c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc @@ -19,6 +19,9 @@ void* initialize_impl( const fptype* momenta, const fptype* couplings, fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif fptype* numerators, fptype* denominators, std::size_t count @@ -30,7 +33,7 @@ void* initialize_impl( bool *is_good_hel_device; checkGpu(gpuMalloc(&is_good_hel_device, CPPProcess::ncomb)); sigmaKin_getGoodHel<<>>( - momenta, couplings, matrix_elements, numerators, denominators, is_good_hel_device + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, is_good_hel_device ); checkGpu(gpuPeekAtLastError()); checkGpu(gpuMemcpy( @@ -49,13 +52,20 @@ void initialize( const fptype* momenta, const fptype* couplings, fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif fptype* numerators, fptype* denominators, std::size_t count ) { // static local initialization is called exactly once in a thread-safe way static void* dummy = initialize_impl( - momenta, couplings, matrix_elements, numerators, denominators, count + momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, denominators, count ); } @@ -148,6 +158,13 @@ __global__ void copy_outputs( #endif // MGONGPUCPP_GPUIMPL +struct InterfaceInstance { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif +}; + } extern "C" { @@ -187,12 +204,14 @@ UmamiStatus umami_get_meta(UmamiMetaKey meta_key, void* result) { UmamiStatus umami_initialize(UmamiHandle* handle, char const* param_card_path) { CPPProcess process; process.initProc(param_card_path); - // We don't actually need the CPPProcess instance for anything as it initializes a - // global variable. 
So here we just return a boolean that is used to store whether - // the good helicities are initialized - *handle = new bool(false); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for (int ihel = 0; ihel < CPPProcess::ncomb; ihel++) { + gpuStreamCreate(&instance->hel_streams[ihel]); + } +#endif return UMAMI_SUCCESS; - } @@ -325,10 +344,13 @@ UmamiStatus umami_matrix_element( gpuMallocAsync(&diagram_random, rounded_count * sizeof(fptype), gpu_stream); gpuMallocAsync(&matrix_elements, rounded_count * sizeof(fptype), gpu_stream); gpuMallocAsync(&diagram_index, rounded_count * sizeof(unsigned int), gpu_stream); - gpuMallocAsync(&numerators, rounded_count * CPPProcess::ndiagrams * sizeof(fptype), gpu_stream); + gpuMallocAsync(&color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof(fptype), gpu_stream); + gpuMallocAsync(&numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof(fptype), gpu_stream); gpuMallocAsync(&denominators, rounded_count * sizeof(fptype), gpu_stream); gpuMallocAsync(&helicity_index, rounded_count * sizeof(int), gpu_stream); gpuMallocAsync(&color_index, rounded_count * sizeof(int), gpu_stream); + gpuMallocAsync(&ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof(fptype), gpu_stream); + gpuMallocAsync(&ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof(fptype), gpu_stream); copy_inputs<<>>( momenta_in, @@ -348,30 +370,40 @@ UmamiStatus umami_matrix_element( ); computeDependentCouplings<<>>(g_s, couplings); checkGpu(gpuPeekAtLastError()); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize(gpu_stream); - bool& is_initialized = *static_cast(handle); - if (!is_initialized) { - gpuStreamSynchronize(gpu_stream); + InterfaceInstance* instance = static_cast(handle); + if (!instance->initialized) { initialize( - momenta, couplings, matrix_elements, numerators, denominators, rounded_count + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); - is_initialized = true; + instance->initialized = true; } - sigmaKin<<>>( + sigmaKin( momenta, couplings, helicity_random, color_random, - matrix_elements, nullptr, diagram_random, - diagram_index, + matrix_elements, + helicity_index, + color_index, + color_jamps, numerators, denominators, + diagram_index, false, - helicity_index, - color_index + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); copy_outputs<<>>( denominators, @@ -422,9 +454,6 @@ UmamiStatus umami_matrix_element( HostBufferBase helicity_index(rounded_count); HostBufferBase color_index(rounded_count); - /*for (std::size_t i_event = 0; i_event < rounded_count; ++i_event) { - diagram_index[i_event] = 2; - }*/ for (std::size_t i_event = 0; i_event < count; ++i_event) { transpose_momenta(&momenta_in[offset], momenta.data(), i_event, stride); helicity_random[i_event] = random_helicity_in ? 
@@ -440,8 +469,8 @@ UmamiStatus umami_matrix_element( g_s.data(), couplings.data(), rounded_count ); - bool& is_initialized = *static_cast(handle); - if (!is_initialized) { + InterfaceInstance* instance = static_cast(handle); + if (!instance->initialized) { initialize( momenta.data(), couplings.data(), @@ -450,7 +479,7 @@ UmamiStatus umami_matrix_element( denominators.data(), rounded_count ); - is_initialized = true; + instance->initialized = true; } sigmaKin( @@ -503,7 +532,13 @@ UmamiStatus umami_matrix_element( UmamiStatus umami_free(UmamiHandle handle) { - delete static_cast(handle); + InterfaceInstance* instance = static_cast(handle); +#ifdef MGONGPUCPP_GPUIMPL + for (int ihel = 0; ihel < CPPProcess::ncomb; ihel++) { + if (instance->hel_streams[ihel]) gpuStreamDestroy(instance->hel_streams[ihel]); + } +#endif + delete instance; return UMAMI_SUCCESS; } diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 544631373b..37b2c620a7 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1938,7 +1938,8 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else From 6b104f18bf2e3bbe812c26d578ae21595decbcd8 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Wed, 26 Nov 2025 13:26:24 +0100 Subject: [PATCH 10/18] debugging cuda version --- .../template_files/gpu/GpuAbstraction.h | 6 +++ .../gpu/MatrixElementKernels.cc | 2 +- .../gpu/process_function_definitions.inc | 33 +++++++++++++--- .../gpu/process_sigmaKin_function.inc | 18 +-------- .../iolibs/template_files/gpu/umami.cc | 39 +++++++++---------- .../iolibs/template_files/gpu/umami.h | 2 +- 6 files changed, 55 insertions(+), 45 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) 
+#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index 44eb05360f..469edd8d9e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) * CPPProcess::ndiagrams ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 8eda543389..aca1f05bba 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -656,16 +656,36 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, + const fptype* allDenominators, + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if ( allrnddiagram != nullptr ) { + fptype numerator_sum = 0.; + for( unsigned int ichan = 0; ichan < 
processConfig::ndiagrams; ichan++ ) + { + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / allDenominators[ievt] ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -743,7 +763,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) - unsigned int* ghelAllDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index de3b6107ce..ea7bfd55d8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -84,23 +84,9 @@ // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of channel - if ( allrnddiagram != nullptr ) { - fptype numerator_sum = 0.; - for( unsigned int ichan = 0; ichan < ndiagrams; ichan++ ) - { - numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; - if( allrnddiagram[ievt] < numerator_sum / allDenominators[ievt] ) - { - channelId = ichan + 1; - break; - } - } - allDiagramIdsOut[ievt] = channelId; - } - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc index 9812460c1c..a6a1a41ec2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc @@ -29,16 +29,15 @@ void* 
initialize_impl( bool is_good_hel[CPPProcess::ncomb]; #ifdef MGONGPUCPP_GPUIMPL std::size_t n_threads = 256; - std::size_t n_blocks = count / n_threads; bool *is_good_hel_device; - checkGpu(gpuMalloc(&is_good_hel_device, CPPProcess::ncomb)); - sigmaKin_getGoodHel<<>>( - momenta, couplings, matrix_elements, color_jamps, numerators, denominators, is_good_hel_device + gpuMalloc(&is_good_hel_device, CPPProcess::ncomb); + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, color_jamps, is_good_hel_device, count ); checkGpu(gpuPeekAtLastError()); - checkGpu(gpuMemcpy( - is_good_hel, is_good_hel_device, sizeof(is_good_hel), gpuMemcpyDefault - )); + gpuMemcpy( + is_good_hel, is_good_hel_device, sizeof(is_good_hel), gpuMemcpyDeviceToHost + ); #else // MGONGPUCPP_GPUIMPL sigmaKin_getGoodHel( momenta, couplings, matrix_elements, numerators, denominators, is_good_hel, count @@ -249,14 +248,11 @@ UmamiStatus umami_matrix_element( ) { const double* momenta_in = nullptr; const double* alpha_s_in = nullptr; - const int* flavor_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused const double* random_color_in = nullptr; const double* random_helicity_in = nullptr; const double* random_diagram_in = nullptr; - const int* diagram_in = nullptr; -#ifdef MGONGPUCPP_GPUIMPL - const gpuStream_t gpu_stream = nullptr; -#endif + const int* diagram_in = nullptr; // TODO: unused for (std::size_t i = 0; i < input_count; ++i) { const void* input = inputs[i]; @@ -284,19 +280,15 @@ UmamiStatus umami_matrix_element( case UMAMI_IN_DIAGRAM_INDEX: diagram_in = static_cast(input); break; - case UMAMI_IN_GPU_STREAM: -#ifdef MGONGPUCPP_GPUIMPL - gpu_stream = static_cast(input); - break; -#else - return UMAMI_ERROR_UNSUPPORTED_INPUT; -#endif default: return UMAMI_ERROR_UNSUPPORTED_INPUT; } } if (!momenta_in) return UMAMI_ERROR_MISSING_INPUT; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif double* m2_out = nullptr; double* amp2_out = nullptr; int* diagram_out = nullptr; @@ -320,6 +312,11 @@ UmamiStatus umami_matrix_element( case UMAMI_OUT_DIAGRAM_INDEX: diagram_out = static_cast(output); break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast(output); + break; +#endif default: return UMAMI_ERROR_UNSUPPORTED_OUTPUT; } @@ -330,8 +327,8 @@ UmamiStatus umami_matrix_element( std::size_t n_blocks = (count + n_threads - 1) / n_threads; std::size_t rounded_count = n_blocks * n_threads; - fptype *momenta, *couplings, *g_s, *helicity_random, *color_random; - fptype *matrix_elements, *numerators, *denominators; + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; int *helicity_index, *color_index; unsigned int *diagram_index; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h index eb801f96d3..29327f7bf9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h @@ -64,7 +64,6 @@ typedef enum { UMAMI_IN_RANDOM_DIAGRAM, UMAMI_IN_HELICITY_INDEX, UMAMI_IN_DIAGRAM_INDEX, - UMAMI_IN_GPU_STREAM, } UmamiInputKey; typedef enum { @@ -73,6 +72,7 @@ typedef enum { UMAMI_OUT_COLOR_INDEX, UMAMI_OUT_HELICITY_INDEX, 
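// Illustrative usage (not part of the patch): umami_matrix_element() receives
// parallel arrays of keys and untyped pointers. With the UMAMI_OUT_GPU_STREAM key
// added just below, a caller that wants the work enqueued on its own CUDA/HIP
// stream now passes the stream handle itself among the *outputs* (the library
// casts it back with static_cast<gpuStream_t>). `momenta`, `alpha_s`, `m2` and
// `my_stream` are placeholders for caller-owned buffers and an existing stream.
//
//   UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
//   const void* in_vals[] = { momenta, alpha_s };
//   UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT, UMAMI_OUT_GPU_STREAM };
//   void* out_vals[] = { m2, (void*)my_stream };
//   umami_matrix_element( handle, count, count, 0,
//                         2, in_keys, in_vals,
//                         2, out_keys, out_vals );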
UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, // NLO: born, virtual, poles, counterterms // color: LC-ME, FC-ME } UmamiOutputKey; From 31391f04655e09c01f32848985d220f9ba3c788f Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Wed, 26 Nov 2025 23:39:34 +0100 Subject: [PATCH 11/18] various bugfixes --- .../gpu/process_function_definitions.inc | 25 ++-- .../gpu/process_sigmaKin_function.inc | 117 +++++------------- .../iolibs/template_files/gpu/umami.cc | 28 ++--- .../CUDACPP_SA_OUTPUT/model_handling.py | 10 +- 4 files changed, 57 insertions(+), 123 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index aca1f05bba..f8a10e6335 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -448,8 +448,7 @@ namespace mg5amcCpu // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -525,8 +524,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
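// What the new boolean means (sketch, not part of the patch): calculate_jamps no
// longer receives a scalar channelId but a single flag saying "accumulate the
// per-diagram numerators and the denominator for this helicity". Its callers derive
// the flag from the two ways multichannel information can be requested,
//
//   const bool storeChannelWeights = ( allChannelIds != nullptr )     // channels fixed by the caller
//                                    || ( allrnddiagram != nullptr ); // channels sampled internally
//
// and the generated diagram loop then adds each |amp|^2 into its own slot, with
// idiag the zero-based number of the contributing diagram:
//
//   if( storeChannelWeights )
//   {
//     numerators_sv[idiag] += cxabs2( amp_sv[0] );
//     denominators_sv += cxabs2( amp_sv[0] );
//   }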
#endif /* clang-format on */ @@ -588,6 +586,7 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const fptype globaldenom) /* clang-format on */ @@ -596,7 +595,7 @@ namespace mg5amcCpu allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities @@ -604,13 +603,15 @@ namespace mg5amcCpu { fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + (ievt + ighel * nevt) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag) { - fptype* hAllNumerators = ghelAllNumerators + (idiag + ighel * nevt) * processConfig::ndiagrams; - totAllNumerators[ievt + idiag * processConfig::ndiagrams] += hAllNumerators[ievt]; + firstNumerator[idiag] += hAllNumerators[idiag]; } } if (mulChannelWeight) { - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } } #endif @@ -674,16 +675,18 @@ namespace mg5amcCpu // Event-by-event random choice of channel if ( allrnddiagram != nullptr ) { fptype numerator_sum = 0.; - for( unsigned int ichan = 0; ichan < processConfig::ndiagrams; ichan++ ) + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { - numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + numerator_sum += allNumerators[ + ievt * processConfig::ndiagrams + ichan]; if( allrnddiagram[ievt] < numerator_sum / allDenominators[ievt] ) { channelId = ichan + 1; break; } } - allDiagramIdsOut[ievt] = channelId; + allDiagramIdsOut[ievt] = mgOnGpu::channel2iconfig[channelId - 1]; } if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index ea7bfd55d8..74e363691b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -17,7 +17,7 @@ gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -35,8 +35,9 @@ fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - for( int i = 0; i < processConfig::ndiagrams; ++i ) + for( int i = 0; i < processConfig::ndiagrams; ++i ) { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -52,19 +53,6 @@ // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s -/*#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId; - if (allrnddiagram != nullptr) { - // temporarily set this to one if a random number for diagram sampling is - // provided so that calculate_wavefunctions fills the allNumerators and - // allDenominators arrays - channelId = 1; - } else { - channelId = getChannelId( allChannelIds ); - } - -} -#endif*/ for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; @@ -72,7 +60,8 @@ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -83,10 +72,15 @@ // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); // Event-by-event random choice of color and diagram #402 gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, 
gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -124,17 +118,6 @@ const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId; - if (allrnddiagram != nullptr) { - // temporarily set this to one if a random number for diagram sampling is - // provided so that calculate_wavefunctions fills the allNumerators and - // allDenominators arrays - channelId = 1; - } else { - channelId = getChannelId( allChannelIds, ievt00 ); - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -149,7 +132,8 @@ cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -198,34 +182,33 @@ #endif } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // this is a workaround to be able to have different channel ids withing the same - // SIMD vector. 
This should be probably set up in a cleaner way where channelId is - // a vector everywhere #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT const int vecsize = 2 * neppV; #else const int vecsize = neppV; #endif unsigned int channelIdVec[vecsize]; + if (allChannelIds != nullptr) { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; + } + } // Event-by-event random choice of channel if ( allrnddiagram != nullptr ) { for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - fptype numerator_sum_all = 0.; - for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) - { - numerator_sum_all += allNumerators[ - ievt * processConfig::ndiagrams + mgOnGpu::channel2iconfig[ichan]]; - } channelIdVec[ieppV] = mgOnGpu::nchannels; fptype numerator_sum = 0.; for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { numerator_sum += allNumerators[ - ievt * processConfig::ndiagrams + mgOnGpu::channel2iconfig[ichan]]; - if( allrnddiagram[ievt] < numerator_sum / numerator_sum_all ) + ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV %% neppV]; + if( allrnddiagram[ievt] < numerator_sum / allDenominators[ievt] ) { channelIdVec[ieppV] = ichan + 1; break; @@ -233,16 +216,14 @@ } allDiagramIdsOut[ievt] = mgOnGpu::channel2iconfig[channelIdVec[ieppV] - 1]; } - } else { - for( int ieppV = 0; ieppV < vecsize; ++ieppV ) channelIdVec[ieppV] = channelId; } // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - channelId = channelIdVec[ieppV]; + unsigned int channelId = channelIdVec[ieppV]; if( channelId > mgOnGpu::nchannels ) { printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); @@ -261,58 +242,28 @@ printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 } - fptype_sv targetamp[ncolor] = { 0 }; + fptype targetamp[ncolor] = { 0 }; // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; + targetamp[icolC] = 0; else targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * (ieppV / neppV)][ieppV %% neppV]; } -#endif const int ievt = ievt00 + ieppV; //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] //printf( "sigmaKin: ievt=%%d icol=%%d\n", ievt, icolC+1 ); break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt2, allrndcol[ievt2] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) - { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%%d icol=%%d\n", ievt2, icolC+1 ); - break; - } - } -#endif } } else @@ -337,13 +288,7 @@ // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, mulChannelWeight, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc index a6a1a41ec2..5dbc5d3780 100644 --- 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc @@ -27,22 +27,13 @@ void* initialize_impl( std::size_t count ) { bool is_good_hel[CPPProcess::ncomb]; -#ifdef MGONGPUCPP_GPUIMPL - std::size_t n_threads = 256; - bool *is_good_hel_device; - gpuMalloc(&is_good_hel_device, CPPProcess::ncomb); - sigmaKin_getGoodHel( - momenta, couplings, matrix_elements, numerators, denominators, color_jamps, is_good_hel_device, count - ); - checkGpu(gpuPeekAtLastError()); - gpuMemcpy( - is_good_hel, is_good_hel_device, sizeof(is_good_hel), gpuMemcpyDeviceToHost - ); -#else // MGONGPUCPP_GPUIMPL sigmaKin_getGoodHel( - momenta, couplings, matrix_elements, numerators, denominators, is_good_hel, count + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, count ); -#endif // MGONGPUCPP_GPUIMPL sigmaKin_setGoodHel(is_good_hel); return nullptr; } @@ -103,13 +94,11 @@ __global__ void copy_inputs( fptype* color_random, fptype* diagram_random, fptype* g_s, - unsigned int* diagram_index, std::size_t count, std::size_t stride, std::size_t offset ) { std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; - diagram_index[i_event] = 2; if (i_event >= count) return; transpose_momenta(&momenta_in[offset], momenta, i_event, stride); @@ -127,6 +116,7 @@ __global__ void copy_outputs( fptype* denominators, fptype* numerators, fptype* matrix_elements, + unsigned int* diagram_index, int* color_index, int* helicity_index, double* m2_out, @@ -150,7 +140,7 @@ __global__ void copy_outputs( ] / denominator; } } - if (diagram_out) diagram_out[i_event + offset] = 0; + if (diagram_out) diagram_out[i_event + offset] = diagram_index[i_event] - 1; if (color_out) color_out[i_event + offset] = color_index[i_event] - 1; if (helicity_out) helicity_out[i_event + offset] = helicity_index[i_event] - 1; } @@ -343,7 +333,7 @@ UmamiStatus umami_matrix_element( gpuMallocAsync(&diagram_index, rounded_count * sizeof(unsigned int), gpu_stream); gpuMallocAsync(&color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof(fptype), gpu_stream); gpuMallocAsync(&numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof(fptype), gpu_stream); - gpuMallocAsync(&denominators, rounded_count * sizeof(fptype), gpu_stream); + gpuMallocAsync(&denominators, rounded_count * CPPProcess::ncomb * sizeof(fptype), gpu_stream); gpuMallocAsync(&helicity_index, rounded_count * sizeof(int), gpu_stream); gpuMallocAsync(&color_index, rounded_count * sizeof(int), gpu_stream); gpuMallocAsync(&ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof(fptype), gpu_stream); @@ -360,7 +350,6 @@ UmamiStatus umami_matrix_element( color_random, diagram_random, g_s, - diagram_index, count, stride, offset @@ -406,6 +395,7 @@ UmamiStatus umami_matrix_element( denominators, numerators, matrix_elements, + diagram_index, color_index, helicity_index, m2_out, diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 37b2c620a7..ef2ea6baf3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1323,7 +1323,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] 
buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -1332,7 +1332,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -1961,10 +1961,6 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -1999,7 +1995,7 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % id_amp) # wrong fix for BUG #472 res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL") diagnum = diagram.get('number') - res.append("if( channelId != 0 )") + res.append("if( storeChannelWeights )") res.append("{") res.append(" numerators_sv[%i] += cxabs2( amp_sv[0] );" % (diagnum-1)) res.append(" denominators_sv += cxabs2( amp_sv[0] );") From 93f0ad739bdda7dc242046cef6e967366c80e695 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Sun, 30 Nov 2025 19:04:33 +0100 Subject: [PATCH 12/18] return channel instead of diagram id --- .../iolibs/template_files/gpu/process_function_definitions.inc | 2 +- .../iolibs/template_files/gpu/process_sigmaKin_function.inc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index f8a10e6335..57812240d6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -686,7 +686,7 @@ namespace mg5amcCpu break; } } - allDiagramIdsOut[ievt] = mgOnGpu::channel2iconfig[channelId - 1]; + allDiagramIdsOut[ievt] = channelId; } if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) diff --git 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 74e363691b..7254882e02 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -214,7 +214,7 @@ break; } } - allDiagramIdsOut[ievt] = mgOnGpu::channel2iconfig[channelIdVec[ieppV] - 1]; + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; } } From d02e2327d0561e844f76bf55ed0c302741539f10 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Sun, 30 Nov 2025 19:12:29 +0100 Subject: [PATCH 13/18] fix memory leak --- .../madgraph/iolibs/template_files/gpu/umami.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc index 5dbc5d3780..a9e780a4e9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc @@ -414,12 +414,16 @@ UmamiStatus umami_matrix_element( gpuFreeAsync(g_s, gpu_stream); gpuFreeAsync(helicity_random, gpu_stream); gpuFreeAsync(color_random, gpu_stream); + gpuFreeAsync(diagram_random, gpu_stream); gpuFreeAsync(matrix_elements, gpu_stream); gpuFreeAsync(diagram_index, gpu_stream); + gpuFreeAsync(color_jamps, gpu_stream); gpuFreeAsync(numerators, gpu_stream); gpuFreeAsync(denominators, gpu_stream); gpuFreeAsync(helicity_index, gpu_stream); gpuFreeAsync(color_index, gpu_stream); + gpuFreeAsync(ghel_matrix_elements, gpu_stream); + gpuFreeAsync(ghel_jamps, gpu_stream); #else // MGONGPUCPP_GPUIMPL // need to round to round to double page size for some reason From e16da70c102a912b73534901fcffcb311d0245bc Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Wed, 3 Dec 2025 10:38:45 +0200 Subject: [PATCH 14/18] only sample diagrams with valid iconfig --- .../gpu/process_function_definitions.inc | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 57812240d6..47be9ba820 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -674,13 +674,20 @@ namespace mg5amcCpu // Event-by-event random choice of channel if ( allrnddiagram != nullptr ) { - fptype numerator_sum = 0.; + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if (mgOnGpu::channel2iconfig[ichan] == -1) continue; + normalization += allNumerators[ + ievt * processConfig::ndiagrams + ichan]; + } channelId = mgOnGpu::nchannels; for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { + if (mgOnGpu::channel2iconfig[ichan] == -1) continue; numerator_sum += allNumerators[ ievt * processConfig::ndiagrams + ichan]; - if( allrnddiagram[ievt] < numerator_sum / 
allDenominators[ievt] ) + if( allrnddiagram[ievt] < numerator_sum / normalization ) { channelId = ichan + 1; break; From 2fc63c0c00fd3cef7a703678900b7ecc8095ce88 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Wed, 3 Dec 2025 11:49:07 +0100 Subject: [PATCH 15/18] repeat bugfix for simd version --- .../template_files/gpu/process_sigmaKin_function.inc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 7254882e02..1d1d935d7e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -201,14 +201,22 @@ for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if (mgOnGpu::channel2iconfig[ichan] == -1) continue; + normalization += allNumerators[ + ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV %% neppV]; + } channelIdVec[ieppV] = mgOnGpu::nchannels; - fptype numerator_sum = 0.; for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { + if (mgOnGpu::channel2iconfig[ichan] == -1) continue; numerator_sum += allNumerators[ ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV %% neppV]; - if( allrnddiagram[ievt] < numerator_sum / allDenominators[ievt] ) + if( allrnddiagram[ievt] < numerator_sum / normalization ) { channelIdVec[ieppV] = ichan + 1; break; From 603007e583374a59fd648748b0f075e134396877 Mon Sep 17 00:00:00 2001 From: Stefan Roiser Date: Thu, 11 Dec 2025 10:50:04 +0100 Subject: [PATCH 16/18] fix formatting --- .../gpu/process_function_definitions.inc | 24 +- .../gpu/process_sigmaKin_function.inc | 24 +- .../iolibs/template_files/gpu/umami.cc | 616 +++++++++--------- .../iolibs/template_files/gpu/umami.h | 260 ++++---- 4 files changed, 465 insertions(+), 459 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 47be9ba820..9c409d4508 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -447,7 +447,7 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); @@ -603,9 +603,10 @@ 
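// The selection that patches 14 and 15 fix is an inverse-CDF draw over per-diagram
// weights, restricted to channels that map onto a valid SDE configuration. A
// host-side, single-event sketch (not part of the patch; `sampleChannel` is a
// hypothetical helper, and the real kernels read the event-strided buffers shown
// above rather than a flat [ndiagrams] array):
//
//   unsigned int sampleChannel( const fptype* numerators, // [ndiagrams] for one event
//                               fptype rnd )              // uniform in [0,1)
//   {
//     fptype norm = 0;
//     for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
//       if( mgOnGpu::channel2iconfig[ichan] != -1 ) norm += numerators[ichan];
//     fptype cumsum = 0;
//     for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
//     {
//       if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; // skip invalid iconfig
//       cumsum += numerators[ichan];
//       if( rnd < cumsum / norm ) return ichan + 1;           // channelId is 1-based
//     }
//     return mgOnGpu::nchannels; // fallback when rnd lands on the upper edge
//   }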
namespace mg5amcCpu { fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; totAllDenominators[ievt] += hAllDenominators[ievt]; - fptype* hAllNumerators = ghelAllNumerators + (ievt + ighel * nevt) * processConfig::ndiagrams; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; - for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag) { + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag) + { firstNumerator[idiag] += hAllNumerators[idiag]; } } @@ -663,8 +664,8 @@ namespace mg5amcCpu const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const fptype* allNumerators, - const fptype* allDenominators, + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) @@ -673,20 +674,19 @@ namespace mg5amcCpu // Event-by-event random choice of color #402 // Event-by-event random choice of channel - if ( allrnddiagram != nullptr ) { + if( allrnddiagram != nullptr ) + { fptype numerator_sum = 0., normalization = 0.; for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { - if (mgOnGpu::channel2iconfig[ichan] == -1) continue; - normalization += allNumerators[ - ievt * processConfig::ndiagrams + ichan]; + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; } channelId = mgOnGpu::nchannels; for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { - if (mgOnGpu::channel2iconfig[ichan] == -1) continue; - numerator_sum += allNumerators[ - ievt * processConfig::ndiagrams + ichan]; + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; if( allrnddiagram[ievt] < numerator_sum / normalization ) { channelId = ichan + 1; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 1d1d935d7e..a65e806d08 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -35,7 +35,8 @@ fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - for( int i = 0; i < processConfig::ndiagrams; ++i ) { + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; } denominators_sv = fptype_sv{ 0 }; @@ -187,8 +188,9 @@ #else const int vecsize = neppV; #endif - unsigned int channelIdVec[vecsize]; - if (allChannelIds != nullptr) { + unsigned int channelIdVec[vecsize]; + if (allChannelIds != nullptr) + { for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = 
ievt00 + ieppV; @@ -204,18 +206,16 @@ fptype numerator_sum = 0., normalization = 0.; for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { - if (mgOnGpu::channel2iconfig[ichan] == -1) continue; - normalization += allNumerators[ - ievt / neppV * neppV * processConfig::ndiagrams + - ichan * neppV + ieppV %% neppV]; + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV %% neppV]; } channelIdVec[ieppV] = mgOnGpu::nchannels; for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { - if (mgOnGpu::channel2iconfig[ichan] == -1) continue; - numerator_sum += allNumerators[ - ievt / neppV * neppV * processConfig::ndiagrams + - ichan * neppV + ieppV %% neppV]; + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV %% neppV]; if( allrnddiagram[ievt] < numerator_sum / normalization ) { channelIdVec[ieppV] = ichan + 1; @@ -259,7 +259,7 @@ else targetamp[icolC] = targetamp[icolC - 1]; if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += - jamp2_sv[icolC + ncolor * (ieppV / neppV)][ieppV %% neppV]; + jamp2_sv[icolC + ncolor * (ieppV / neppV)][ieppV %% neppV]; } const int ievt = ievt00 + ieppV; //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc index a9e780a4e9..348597b9bb 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc @@ -1,9 +1,9 @@ #include "umami.h" #include "CPPProcess.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" -#include "GpuRuntime.h" #include @@ -13,9 +13,10 @@ using namespace mg5amcGpu; using namespace mg5amcCpu; #endif -namespace { +namespace +{ -void* initialize_impl( + void* initialize_impl( const fptype* momenta, const fptype* couplings, fptype* matrix_elements, @@ -24,21 +25,21 @@ void* initialize_impl( #endif fptype* numerators, fptype* denominators, - std::size_t count -) { + std::size_t count ) + { bool is_good_hel[CPPProcess::ncomb]; sigmaKin_getGoodHel( momenta, couplings, matrix_elements, numerators, denominators, #ifdef MGONGPUCPP_GPUIMPL color_jamps, #endif - is_good_hel, count - ); - sigmaKin_setGoodHel(is_good_hel); - return nullptr; -} + is_good_hel, + count ); + sigmaKin_setGoodHel(is_good_hel); + return nullptr; + } -void initialize( + void initialize( const fptype* momenta, const fptype* couplings, fptype* matrix_elements, @@ -47,43 +48,40 @@ void initialize( #endif fptype* numerators, fptype* denominators, - std::size_t count -) { + std::size_t count ) + { // static local initialization is called exactly once in a thread-safe way - static void* dummy = initialize_impl( - momenta, couplings, matrix_elements, + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, #ifdef MGONGPUCPP_GPUIMPL - color_jamps, + color_jamps, #endif - numerators, denominators, count + numerators, denominators, count ); } #ifdef MGONGPUCPP_GPUIMPL -__device__ + __device__ #endif -void transpose_momenta( - const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride -) { - std::size_t page_size = 
MemoryAccessMomentaBase::neppM; - std::size_t i_page = i_event / page_size; - std::size_t i_vector = i_event % page_size; - - for (std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part) { - for(std::size_t i_mom = 0; i_mom < 4; ++i_mom) { - momenta_out[ - i_page * CPPProcess::npar * 4 * page_size + - i_part * 4 * page_size + i_mom * page_size + i_vector - ] = momenta_in[ - stride * (CPPProcess::npar * i_mom + i_part) + i_event - ]; + void + transpose_momenta( + const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for (std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part) { + for(std::size_t i_mom = 0; i_mom < 4; ++i_mom) { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( + CPPProcess::npar * i_mom + i_part) + i_event]; } } } #ifdef MGONGPUCPP_GPUIMPL -__global__ void copy_inputs( + __global__ void copy_inputs( const double* momenta_in, const double* helicity_random_in, const double* color_random_in, @@ -96,23 +94,19 @@ __global__ void copy_inputs( fptype* g_s, std::size_t count, std::size_t stride, - std::size_t offset -) { + std::size_t offset ) + { std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; - if (i_event >= count) return; - - transpose_momenta(&momenta_in[offset], momenta, i_event, stride); - diagram_random[i_event] = diagram_random_in ? - diagram_random_in[i_event + offset] : 0.5; - helicity_random[i_event] = helicity_random_in ? - helicity_random_in[i_event + offset] : 0.5; - color_random[i_event] = color_random_in ? - color_random_in[i_event + offset] : 0.5; - g_s[i_event] = alpha_s_in ? - sqrt(4 * M_PI * alpha_s_in[i_event + offset]) : 1.2177157847767195; -} + if ( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt(4 * M_PI * alpha_s_in[i_event + offset]) : 1.2177157847767195; + } -__global__ void copy_outputs( + __global__ void copy_outputs( fptype* denominators, fptype* numerators, fptype* matrix_elements, @@ -126,105 +120,112 @@ __global__ void copy_outputs( int* helicity_out, std::size_t count, std::size_t stride, - std::size_t offset -) { + std::size_t offset ) + { std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; - if (i_event >= count) return; - - if (m2_out) m2_out[i_event + offset] = matrix_elements[i_event]; - if (amp2_out) { - double denominator = denominators[i_event]; - for (std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag) { - amp2_out[stride * i_diag + i_event + offset] = numerators[ - i_event * CPPProcess::ndiagrams + i_diag - ] / denominator; - } + if ( i_event >= count ) return; + + if ( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if ( amp2_out) + { + double denominator = denominators[i_event]; + for (std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } } - if (diagram_out) diagram_out[i_event + offset] = diagram_index[i_event] - 1; - if (color_out) color_out[i_event + offset] = color_index[i_event] - 1; - if (helicity_out) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + if ( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if ( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if ( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; } #endif // MGONGPUCPP_GPUIMPL -struct InterfaceInstance { + struct InterfaceInstance + { bool initialized = false; #ifdef MGONGPUCPP_GPUIMPL gpuStream_t hel_streams[CPPProcess::ncomb]; #endif -}; + }; } -extern "C" { +extern "C" +{ -UmamiStatus umami_get_meta(UmamiMetaKey meta_key, void* result) { - switch (meta_key) { - case UMAMI_META_DEVICE: { + UmamiStatus umami_get_meta(UmamiMetaKey meta_key, void* result) + { + switch (meta_key) + { + case UMAMI_META_DEVICE: + { UmamiDevice& device = *static_cast(result); #ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ device = UMAMI_DEVICE_CUDA; -#elif defined(__HIPCC__) +#elif defined( __HIPCC__ ) device = UMAMI_DEVICE_HIP; #endif #else device = UMAMI_DEVICE_CPU; #endif break; - } case UMAMI_META_PARTICLE_COUNT: - *static_cast(result) = CPPProcess::npar; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; break; - case UMAMI_META_DIAGRAM_COUNT: - *static_cast(result) = CPPProcess::ndiagrams; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; break; - case UMAMI_META_HELICITY_COUNT: - *static_cast(result) = CPPProcess::ncomb; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; break; - case UMAMI_META_COLOR_COUNT: + case UMAMI_META_COLOR_COUNT: return UMAMI_ERROR_UNSUPPORTED_META; - default: + default: return UMAMI_ERROR_UNSUPPORTED_META; } return UMAMI_SUCCESS; } -UmamiStatus umami_initialize(UmamiHandle* handle, char const* param_card_path) { + UmamiStatus umami_initialize(UmamiHandle* handle, char const* param_card_path) + { CPPProcess process; - process.initProc(param_card_path); + process.initProc( param_card_path ); auto instance = new InterfaceInstance(); *handle = instance; #ifdef MGONGPUCPP_GPUIMPL - for (int ihel = 0; ihel < CPPProcess::ncomb; ihel++) { - gpuStreamCreate(&instance->hel_streams[ihel]); + for( int ihel = 0; ihel < 
CPPProcess::ncomb; ihel++ ) { + gpuStreamCreate(&instance->hel_streams[ihel]); } #endif return UMAMI_SUCCESS; -} + } -UmamiStatus umami_set_parameter( + UmamiStatus umami_set_parameter( UmamiHandle handle, char const* name, double parameter_real, - double parameter_imag -) { + double parameter_imag ) + { return UMAMI_ERROR_NOT_IMPLEMENTED; -} + } -UmamiStatus umami_get_parameter( + UmamiStatus umami_get_parameter( UmamiHandle handle, char const* name, double* parameter_real, - double* parameter_imag -) { + double* parameter_imag) + { return UMAMI_ERROR_NOT_IMPLEMENTED; -} + } -UmamiStatus umami_matrix_element( + UmamiStatus umami_matrix_element( UmamiHandle handle, size_t count, size_t stride, @@ -234,8 +235,8 @@ UmamiStatus umami_matrix_element( void const* const* inputs, size_t output_count, UmamiOutputKey const* output_keys, - void* const* outputs -) { + void* const* outputs ) +{ const double* momenta_in = nullptr; const double* alpha_s_in = nullptr; const int* flavor_in = nullptr; // TODO: unused @@ -244,37 +245,39 @@ UmamiStatus umami_matrix_element( const double* random_diagram_in = nullptr; const int* diagram_in = nullptr; // TODO: unused - for (std::size_t i = 0; i < input_count; ++i) { - const void* input = inputs[i]; - switch (input_keys[i]) { + for (std::size_t i = 0; i < input_count; ++i) + { + const void* input = inputs[i]; + switch (input_keys[i]) + { case UMAMI_IN_MOMENTA: - momenta_in = static_cast(input); - break; + momenta_in = static_cast(input); + break; case UMAMI_IN_ALPHA_S: - alpha_s_in = static_cast(input); - break; + alpha_s_in = static_cast(input); + break; case UMAMI_IN_FLAVOR_INDEX: - flavor_in = static_cast(input); - break; + flavor_in = static_cast(input); + break; case UMAMI_IN_RANDOM_COLOR: - random_color_in = static_cast(input); - break; + random_color_in = static_cast(input); + break; case UMAMI_IN_RANDOM_HELICITY: - random_helicity_in = static_cast(input); - break; + random_helicity_in = static_cast(input); + break; case UMAMI_IN_RANDOM_DIAGRAM: - random_diagram_in = static_cast(input); - break; + random_diagram_in = static_cast(input); + break; case UMAMI_IN_HELICITY_INDEX: - return UMAMI_ERROR_UNSUPPORTED_INPUT; + return UMAMI_ERROR_UNSUPPORTED_INPUT; case UMAMI_IN_DIAGRAM_INDEX: - diagram_in = static_cast(input); - break; + diagram_in = static_cast(input); + break; default: - return UMAMI_ERROR_UNSUPPORTED_INPUT; - } + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } } - if (!momenta_in) return UMAMI_ERROR_MISSING_INPUT; + if ( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; #ifdef MGONGPUCPP_GPUIMPL gpuStream_t gpu_stream = nullptr; @@ -284,253 +287,252 @@ UmamiStatus umami_matrix_element( int* diagram_out = nullptr; int* color_out = nullptr; int* helicity_out = nullptr; - for (std::size_t i = 0; i < output_count; ++i) { - void* output = outputs[i]; - switch (output_keys[i]) { + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch (output_keys[i]) + { case UMAMI_OUT_MATRIX_ELEMENT: - m2_out = static_cast(output); - break; + m2_out = static_cast(output); + break; case UMAMI_OUT_DIAGRAM_AMP2: - amp2_out = static_cast(output); - break; + amp2_out = static_cast(output); + break; case UMAMI_OUT_COLOR_INDEX: - color_out = static_cast(output); - break; + color_out = static_cast(output); + break; case UMAMI_OUT_HELICITY_INDEX: - helicity_out = static_cast(output); - break; + helicity_out = static_cast(output); + break; case UMAMI_OUT_DIAGRAM_INDEX: - diagram_out = static_cast(output); - break; + diagram_out = static_cast(output); 
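// Layout note (sketch, not part of the patch): the per-diagram weights requested via
// UMAMI_OUT_DIAGRAM_AMP2 are written diagram-major over a batch of `stride` events,
// matching the indexing in copy_outputs above; `amp2` below is a hypothetical helper
// showing how a caller reads diagram `d` of event `i`:
//
//   double amp2( const double* amp2_out, std::size_t d, std::size_t i,
//                std::size_t stride, std::size_t offset )
//   {
//     return amp2_out[stride * d + i + offset];
//   }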
+ break; #ifdef MGONGPUCPP_GPUIMPL case UMAMI_OUT_GPU_STREAM: - gpu_stream = static_cast<gpuStream_t>(output); - break; + gpu_stream = static_cast<gpuStream_t>(output); + break; #endif default: - return UMAMI_ERROR_UNSUPPORTED_OUTPUT; - } + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } }
#ifdef MGONGPUCPP_GPUIMPL std::size_t n_threads = 256; - std::size_t n_blocks = (count + n_threads - 1) / n_threads; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; std::size_t rounded_count = n_blocks * n_threads; fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; int *helicity_index, *color_index; - unsigned int *diagram_index; + unsigned int* diagram_index; std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
- gpuMallocAsync(&momenta, rounded_count * CPPProcess::npar * 4 * sizeof(fptype), gpu_stream); - gpuMallocAsync(&couplings, rounded_count * n_coup * 2 * sizeof(fptype), gpu_stream); - gpuMallocAsync(&g_s, rounded_count * sizeof(fptype), gpu_stream); - gpuMallocAsync(&helicity_random, rounded_count * sizeof(fptype), gpu_stream); - gpuMallocAsync(&color_random, rounded_count * sizeof(fptype), gpu_stream); - gpuMallocAsync(&diagram_random, rounded_count * sizeof(fptype), gpu_stream); - gpuMallocAsync(&matrix_elements, rounded_count * sizeof(fptype), gpu_stream); - gpuMallocAsync(&diagram_index, rounded_count * sizeof(unsigned int), gpu_stream); - gpuMallocAsync(&color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof(fptype), gpu_stream); - gpuMallocAsync(&numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof(fptype), gpu_stream); - gpuMallocAsync(&denominators, rounded_count * CPPProcess::ncomb * sizeof(fptype), gpu_stream); - gpuMallocAsync(&helicity_index, rounded_count * sizeof(int), gpu_stream); - gpuMallocAsync(&color_index, rounded_count * sizeof(int), gpu_stream); - gpuMallocAsync(&ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof(fptype), gpu_stream); - gpuMallocAsync(&ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof(fptype), gpu_stream); + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof(fptype), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof(fptype), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof(fptype), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof(fptype), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof(fptype), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof(fptype), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof(fptype), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof(unsigned int), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof(fptype), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof(fptype), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof(fptype), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof(int), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof(int), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof(fptype), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb *
CPPProcess::ncolor * mgOnGpu::nx2 * sizeof(fptype), gpu_stream ); copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( - momenta_in, - random_helicity_in, - random_color_in, - random_diagram_in, - alpha_s_in, - momenta, - helicity_random, - color_random, - diagram_random, - g_s, - count, - stride, - offset + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset );
- computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>(g_s, couplings); - checkGpu(gpuPeekAtLastError()); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); // TODO: make things fully async (requires using events instead of synchronize in // the sigmaKin implementation) - gpuStreamSynchronize(gpu_stream); + gpuStreamSynchronize( gpu_stream );
- InterfaceInstance* instance = static_cast<InterfaceInstance*>(handle); - if (!instance->initialized) { - initialize( - momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count - ); + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if ( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); instance->initialized = true; }
sigmaKin( - momenta, - couplings, - helicity_random, - color_random, - nullptr, - diagram_random, - matrix_elements, - helicity_index, - color_index, - color_jamps, - numerators, - denominators, - diagram_index, - false, - ghel_matrix_elements, - ghel_jamps, - nullptr, - nullptr, - instance->hel_streams, - n_blocks, - n_threads - ); + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads );
+ copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( - denominators, - numerators, - matrix_elements, - diagram_index, - color_index, - helicity_index, - m2_out, - amp2_out, - diagram_out, - color_out, - helicity_out, - count, - stride, - offset - ); + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); checkGpu(gpuPeekAtLastError());
- gpuFreeAsync(momenta, gpu_stream); - gpuFreeAsync(couplings, gpu_stream); - gpuFreeAsync(g_s, gpu_stream); - gpuFreeAsync(helicity_random, gpu_stream); - gpuFreeAsync(color_random, gpu_stream); - gpuFreeAsync(diagram_random, gpu_stream); - gpuFreeAsync(matrix_elements, gpu_stream); - gpuFreeAsync(diagram_index, gpu_stream); - gpuFreeAsync(color_jamps, gpu_stream); - gpuFreeAsync(numerators, gpu_stream); - gpuFreeAsync(denominators, gpu_stream); - gpuFreeAsync(helicity_index, gpu_stream); - gpuFreeAsync(color_index, gpu_stream); - gpuFreeAsync(ghel_matrix_elements, gpu_stream); - gpuFreeAsync(ghel_jamps, gpu_stream); - -#else // MGONGPUCPP_GPUIMPL + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync(
color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL // need to round to double page size for some reason std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; - std::size_t rounded_count = (count + page_size2 - 1) / page_size2 * page_size2; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
- HostBufferBase<fptype> momenta(rounded_count * CPPProcess::npar * 4); + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); HostBufferBase<fptype> couplings( - rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 - ); - HostBufferBase<fptype> g_s(rounded_count); - HostBufferBase<fptype> helicity_random(rounded_count); - HostBufferBase<fptype> color_random(rounded_count); - HostBufferBase<fptype> diagram_random(rounded_count); - HostBufferBase<fptype> matrix_elements(rounded_count); - HostBufferBase<unsigned int> diagram_index(rounded_count); - HostBufferBase<fptype> numerators(rounded_count * CPPProcess::ndiagrams); - HostBufferBase<fptype> denominators(rounded_count); - HostBufferBase<int> helicity_index(rounded_count); - HostBufferBase<int> color_index(rounded_count); -
- for (std::size_t i_event = 0; i_event < count; ++i_event) { - transpose_momenta(&momenta_in[offset], momenta.data(), i_event, stride); - helicity_random[i_event] = random_helicity_in ? - random_helicity_in[i_event + offset] : 0.5; - color_random[i_event] = random_color_in ? - random_color_in[i_event + offset] : 0.5; - diagram_random[i_event] = random_diagram_in ? - random_diagram_in[i_event + offset] : 0.5; - g_s[i_event] = alpha_s_in ? - sqrt(4 * M_PI * alpha_s_in[i_event + offset]) : 1.2177157847767195; - } - computeDependentCouplings( - g_s.data(), couplings.data(), rounded_count + rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); -
- InterfaceInstance* instance = static_cast<InterfaceInstance*>(handle); - if (!instance->initialized) { - initialize( - momenta.data(), - couplings.data(), - matrix_elements.data(), - numerators.data(), - denominators.data(), - rounded_count - ); - instance->initialized = true; + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for ( std::size_t i_event = 0; i_event < count; ++i_event) + { + transpose_momenta(&momenta_in[offset], momenta.data(), i_event, stride); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
sqrt(4 * M_PI * alpha_s_in[i_event + offset]) : 1.2177157847767195; } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
- sigmaKin( + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if (!instance->initialized) + { + initialize( momenta.data(), couplings.data(), - helicity_random.data(), - color_random.data(), - nullptr, - diagram_random.data(), matrix_elements.data(), - helicity_index.data(), - color_index.data(), numerators.data(), denominators.data(), - diagram_index.data(), - false, - rounded_count - ); + rounded_count); + instance->initialized = true; + }
- std::size_t page_size = MemoryAccessMomentaBase::neppM; - for (std::size_t i_event = 0; i_event < count; ++i_event) { - std::size_t i_page = i_event / page_size; - std::size_t i_vector = i_event % page_size; + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count );
- double denominator = denominators[i_event]; - if (m2_out != nullptr) { - m2_out[i_event + offset] = matrix_elements[i_event]; - } - if (amp2_out != nullptr) { - for (std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag) { - amp2_out[stride * i_diag + i_event + offset] = numerators[ - i_page * page_size * CPPProcess::ndiagrams + - i_diag * page_size + i_vector - ] / denominator; - } - } - if (diagram_out != nullptr) { - diagram_out[i_event + offset] = diagram_index[i_event] - 1; - } - if (color_out != nullptr) { - color_out[i_event + offset] = color_index[i_event] - 1; - } - if (helicity_out != nullptr) { - helicity_out[i_event + offset] = helicity_index[i_event] - 1; + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for (std::size_t i_event = 0; i_event < count; ++i_event) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if (m2_out != nullptr) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if (amp2_out != nullptr) + { + for (std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; } + } + if (diagram_out != nullptr) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if (color_out != nullptr) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if (helicity_out != nullptr) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } } #endif // MGONGPUCPP_GPUIMPL return UMAMI_SUCCESS; }
-UmamiStatus umami_free(UmamiHandle handle) { - InterfaceInstance* instance = static_cast<InterfaceInstance*>(handle); +UmamiStatus umami_free(UmamiHandle handle) +{ + InterfaceInstance* instance = static_cast<InterfaceInstance*>(handle); #ifdef MGONGPUCPP_GPUIMPL - for (int ihel = 0; ihel < CPPProcess::ncomb; ihel++) { - if (instance->hel_streams[ihel]) gpuStreamDestroy(instance->hel_streams[ihel]); - } + for (int ihel = 0; ihel < CPPProcess::ncomb; ihel++) + { + if (instance->hel_streams[ihel]) gpuStreamDestroy(instance->hel_streams[ihel]); + } #endif - delete instance; - return UMAMI_SUCCESS; + delete instance; + return UMAMI_SUCCESS; } } diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h index 29327f7bf9..8e86854c45 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h @@ -17,21 +17,24 @@ #include #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -/** - * Major version number of the UMAMI interface. If the major version is the same - * between caller and implementation, binary compatibility is ensured. - */ -const inline int UMAMI_MAJOR_VERSION = 1; -/** - * Minor version number of the UMAMI interface. Between minor versions, new keys for - * errors, devices, metadata, inputs and outputs can be added. - */ -const inline int UMAMI_MINOR_VERSION = 0; + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. + */ + const inline int UMAMI_MINOR_VERSION = 0; -typedef enum { + typedef enum + { UMAMI_SUCCESS, UMAMI_ERROR, UMAMI_ERROR_NOT_IMPLEMENTED, @@ -39,23 +42,26 @@ typedef enum { UMAMI_ERROR_UNSUPPORTED_OUTPUT, UMAMI_ERROR_UNSUPPORTED_META, UMAMI_ERROR_MISSING_INPUT, -} UmamiStatus; + } UmamiStatus; -typedef enum { + typedef enum + { UMAMI_DEVICE_CPU, UMAMI_DEVICE_CUDA, UMAMI_DEVICE_HIP, -} UmamiDevice; + } UmamiDevice; -typedef enum { + typedef enum + { UMAMI_META_DEVICE, UMAMI_META_PARTICLE_COUNT, UMAMI_META_DIAGRAM_COUNT, UMAMI_META_HELICITY_COUNT, UMAMI_META_COLOR_COUNT, -} UmamiMetaKey; + } UmamiMetaKey; -typedef enum { + typedef enum + { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S, UMAMI_IN_FLAVOR_INDEX, @@ -64,9 +70,10 @@ typedef enum { UMAMI_IN_RANDOM_DIAGRAM, UMAMI_IN_HELICITY_INDEX, UMAMI_IN_DIAGRAM_INDEX, -} UmamiInputKey; + } UmamiInputKey; -typedef enum { + typedef enum + { UMAMI_OUT_MATRIX_ELEMENT, UMAMI_OUT_DIAGRAM_AMP2, UMAMI_OUT_COLOR_INDEX, @@ -75,113 +82,111 @@ typedef enum { UMAMI_OUT_GPU_STREAM, // NLO: born, virtual, poles, counterterms // color: LC-ME, FC-ME -} UmamiOutputKey; - -typedef void* UmamiHandle; - - -/** - * Creates an instance of the matrix element. Each instance is independent, so thread - * safety can be achieved by creating a separate one for every thread. - * - * @param meta_key - * path to the parameter file - * @param handle - * pointer to an instance of the subprocess. Has to be cleaned up by - * the caller with `free_subprocess`. - * @return - * UMAMI_SUCCESS on success, error code otherwise - */ -UmamiStatus umami_get_meta(UmamiMetaKey meta_key, void* result); - -/** - * Creates an instance of the matrix element. Each instance is independent, so thread - * safety can be achieved by creating a separate one for every thread. - * - * @param param_card_path - * path to the parameter file - * @param handle - * pointer to an instance of the subprocess. Has to be cleaned up by - * the caller with `free_subprocess`. 
- * @return - * UMAMI_SUCCESS on success, error code otherwise - */ -UmamiStatus umami_initialize(UmamiHandle* handle, char const* param_card_path); - -/** - * Sets the value of a model parameter - * - * @param handle - * handle of a matrix element instance - * @param name - * name of the parameter - * @param parameter_real - * real part of the parameter value - * @param parameter_imag - * imaginary part of the parameter value. Ignored for real valued parameters. - * @return - * UMAMI_SUCCESS on success, error code otherwise - */ -UmamiStatus umami_set_parameter( + } UmamiOutputKey; + + typedef void* UmamiHandle; +
+ /** + * Retrieves a global piece of metadata about the process, such as the device type + * or the number of particles, diagrams, helicity combinations or colors. + * + * @param meta_key + * key identifying the requested metadata entry + * @param result + * pointer to caller-allocated memory that receives the value; its type + * depends on the meta key + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta(UmamiMetaKey meta_key, void* result); +
+ /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize(UmamiHandle* handle, char const* param_card_path); +
+ /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( UmamiHandle handle, char const* name, double parameter_real, double parameter_imag );
-/** - * Retrieves the value of a model parameter - * - * @param handle - * handle of a matrix element instance - * @param name - * name of the parameter - * @param parameter_real - * pointer to double to return real part of the parameter value - * @param parameter_imag - * pointer to double to return imaginary part of the parameter value. Ignored - * for real-valued parameters (i.e. you may pass a null pointer) - * @return - * UMAMI_SUCCESS on success, error code otherwise - */ -UmamiStatus umami_get_parameter( + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( UmamiHandle handle, char const* name, double* parameter_real, - double* parameter_imag -); - -/** - * Evaluates the matrix element as a function of the given inputs, filling the - * requested outputs.
- * - * @param handle - * handle of a matrix element instance - * @param count - * number of events to evaluate the matrix element for - * @param stride - * stride of the batch dimension of the input and output arrays, see memory layout - * @param offset - * offset of the event index - * @param input_count - * number of inputs to the matrix element - * @param input_keys - * pointer to an array of input keys, length `input_count` - * @param inputs - * pointer to an array of void pointers to the inputs. The type of the inputs - * depends on the input key - * @param output_count - * number of outputs to the matrix element - * @param output_keys - * pointer to an array of output keys, length `output_count` - * @param outputs - * pointer to an array of void pointers to the outputs. The type of the outputs - * depends on the output key. The caller is responsible for allocating memory for - * the outputs. - * @return - * UMAMI_SUCCESS on success, error code otherwise - */ -UmamiStatus umami_matrix_element( + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. + * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. 
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( UmamiHandle handle, size_t count, size_t stride, @@ -191,16 +196,15 @@ UmamiStatus umami_matrix_element( void const* const* inputs, size_t output_count, UmamiOutputKey const* output_keys, - void* const* outputs -); - -/** - * Frees matrix element instance - * - * @param handle - * handle of a matrix element instance - */ -UmamiStatus umami_free(UmamiHandle handle); + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free(UmamiHandle handle); #ifdef __cplusplus } From 12f9c4d821d738b9d5194b5a25b86257c83c4f9c Mon Sep 17 00:00:00 2001 From: Stefan Roiser Date: Thu, 11 Dec 2025 16:47:45 +0100 Subject: [PATCH 17/18] regenerate all processes --- .../gpu/process_function_definitions.inc | 7 +- .../gpu/process_sigmaKin_function.inc | 7 +- .../iolibs/template_files/gpu/umami.cc | 236 +- .../iolibs/template_files/gpu/umami.h | 9 +- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 28 +- .../ee_mumu.mad/SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 261 +- .../SubProcesses/P1_epem_mupmum/CPPProcess.h | 6 + .../ee_mumu.mad/SubProcesses/cudacpp.mk | 4 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 28 +- .../ee_mumu.sa/SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 257 +- .../P1_Sigma_sm_epem_mupmum/CPPProcess.h | 6 + .../ee_mumu.sa/SubProcesses/cudacpp.mk | 4 +- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 26 +- .../gg_tt.mad/SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 263 +- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 6 + .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 4 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 28 +- .../gg_tt.sa/SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../P1_Sigma_sm_gg_ttx/CPPProcess.cc | 257 +- .../P1_Sigma_sm_gg_ttx/CPPProcess.h | 6 + .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 4 +- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 28 +- .../SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 263 +- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 6 + .../SubProcesses/P2_gg_ttxg/CPPProcess.cc | 287 ++- .../SubProcesses/P2_gg_ttxg/CPPProcess.h | 6 + .../gg_tt01g.mad/SubProcesses/cudacpp.mk | 4 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 26 +- .../gg_ttg.mad/SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 287 ++- .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 6 + .../gg_ttg.mad/SubProcesses/cudacpp.mk | 4 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 26 +- .../gg_ttg.sa/SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 257 +- .../P1_Sigma_sm_gg_ttxg/CPPProcess.h | 6 + .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 4 +- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 26 +- .../gg_ttgg.mad/SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 467 ++-- .../SubProcesses/P1_gg_ttxgg/CPPProcess.h | 6 + .../gg_ttgg.mad/SubProcesses/cudacpp.mk | 4 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 30 +- 
.../gg_ttgg.sa/SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 257 +- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.h | 6 + .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 4 +- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 26 +- .../SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 2147 +++++++++-------- .../SubProcesses/P1_gg_ttxggg/CPPProcess.h | 6 + .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 4 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 30 +- .../gg_ttggg.sa/SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 257 +- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.h | 6 + .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 4 +- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 26 +- .../gq_ttq.mad/SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 267 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.h | 6 + .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 267 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.h | 6 + .../gq_ttq.mad/SubProcesses/cudacpp.mk | 4 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 38 +- .../gq_ttq.sa/SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../P1_Sigma_sm_gu_ttxu/CPPProcess.cc | 257 +- .../P1_Sigma_sm_gu_ttxu/CPPProcess.h | 6 + .../P1_Sigma_sm_gux_ttxux/CPPProcess.cc | 257 +- .../P1_Sigma_sm_gux_ttxux/CPPProcess.h | 6 + .../cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk | 4 +- .../CODEGEN_mad_heft_gg_bb_log.txt | 22 +- .../SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../SubProcesses/P1_gg_bbx/CPPProcess.cc | 265 +- .../SubProcesses/P1_gg_bbx/CPPProcess.h | 6 + .../heft_gg_bb.mad/SubProcesses/cudacpp.mk | 4 +- .../CODEGEN_cudacpp_heft_gg_bb_log.txt | 74 +- .../SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../P1_Sigma_heft_gg_bbx/CPPProcess.cc | 257 +- .../P1_Sigma_heft_gg_bbx/CPPProcess.h | 6 + .../heft_gg_bb.sa/SubProcesses/cudacpp.mk | 4 +- .../CODEGEN_mad_nobm_pp_ttW_log.txt | 28 +- .../SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../SubProcesses/P0_dux_ttxwm/CPPProcess.cc | 261 +- .../SubProcesses/P0_dux_ttxwm/CPPProcess.h | 6 + .../SubProcesses/P0_udx_ttxwp/CPPProcess.cc | 261 +- .../SubProcesses/P0_udx_ttxwp/CPPProcess.h | 6 + .../SubProcesses/P1_dux_ttxwmg/CPPProcess.cc | 281 ++- .../SubProcesses/P1_dux_ttxwmg/CPPProcess.h | 6 + .../SubProcesses/P1_gd_ttxwmu/CPPProcess.cc | 281 ++- .../SubProcesses/P1_gd_ttxwmu/CPPProcess.h | 6 + .../SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc | 281 ++- .../SubProcesses/P1_gdx_ttxwpux/CPPProcess.h | 6 + .../SubProcesses/P1_gu_ttxwpd/CPPProcess.cc | 281 ++- .../SubProcesses/P1_gu_ttxwpd/CPPProcess.h | 6 + .../SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc | 281 ++- .../SubProcesses/P1_gux_ttxwmdx/CPPProcess.h | 6 + .../SubProcesses/P1_udx_ttxwpg/CPPProcess.cc | 281 ++- .../SubProcesses/P1_udx_ttxwpg/CPPProcess.h | 6 + .../nobm_pp_ttW.mad/SubProcesses/cudacpp.mk | 4 +- .../CODEGEN_mad_pp_tt012j_log.txt | 30 +- .../SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../SubProcesses/P0_gg_ttx/CPPProcess.cc | 263 +- .../SubProcesses/P0_gg_ttx/CPPProcess.h | 6 + .../SubProcesses/P0_uux_ttx/CPPProcess.cc | 259 +- .../SubProcesses/P0_uux_ttx/CPPProcess.h | 6 + 
.../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 287 ++- .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 6 + .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 267 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.h | 6 + .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 267 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.h | 6 + .../SubProcesses/P1_uux_ttxg/CPPProcess.cc | 267 +- .../SubProcesses/P1_uux_ttxg/CPPProcess.h | 6 + .../SubProcesses/P2_gg_ttxgg/CPPProcess.cc | 467 ++-- .../SubProcesses/P2_gg_ttxgg/CPPProcess.h | 6 + .../SubProcesses/P2_gg_ttxuux/CPPProcess.cc | 327 +-- .../SubProcesses/P2_gg_ttxuux/CPPProcess.h | 6 + .../SubProcesses/P2_gu_ttxgu/CPPProcess.cc | 327 +-- .../SubProcesses/P2_gu_ttxgu/CPPProcess.h | 6 + .../SubProcesses/P2_gux_ttxgux/CPPProcess.cc | 327 +-- .../SubProcesses/P2_gux_ttxgux/CPPProcess.h | 6 + .../SubProcesses/P2_uc_ttxuc/CPPProcess.cc | 271 ++- .../SubProcesses/P2_uc_ttxuc/CPPProcess.h | 6 + .../SubProcesses/P2_ucx_ttxucx/CPPProcess.cc | 271 ++- .../SubProcesses/P2_ucx_ttxucx/CPPProcess.h | 6 + .../SubProcesses/P2_uu_ttxuu/CPPProcess.cc | 285 ++- .../SubProcesses/P2_uu_ttxuu/CPPProcess.h | 6 + .../SubProcesses/P2_uux_ttxccx/CPPProcess.cc | 271 ++- .../SubProcesses/P2_uux_ttxccx/CPPProcess.h | 6 + .../SubProcesses/P2_uux_ttxgg/CPPProcess.cc | 327 +-- .../SubProcesses/P2_uux_ttxgg/CPPProcess.h | 6 + .../SubProcesses/P2_uux_ttxuux/CPPProcess.cc | 285 ++- .../SubProcesses/P2_uux_ttxuux/CPPProcess.h | 6 + .../P2_uxcx_ttxuxcx/CPPProcess.cc | 271 ++- .../SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h | 6 + .../P2_uxux_ttxuxux/CPPProcess.cc | 285 ++- .../SubProcesses/P2_uxux_ttxuxux/CPPProcess.h | 6 + .../pp_tt012j.mad/SubProcesses/cudacpp.mk | 4 +- .../CODEGEN_mad_smeft_gg_tttt_log.txt | 26 +- .../SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../SubProcesses/P1_gg_ttxttx/CPPProcess.cc | 397 +-- .../SubProcesses/P1_gg_ttxttx/CPPProcess.h | 6 + .../smeft_gg_tttt.mad/SubProcesses/cudacpp.mk | 4 +- .../CODEGEN_cudacpp_smeft_gg_tttt_log.txt | 66 +- .../SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../CPPProcess.cc | 257 +- .../CPPProcess.h | 6 + .../smeft_gg_tttt.sa/SubProcesses/cudacpp.mk | 4 +- .../CODEGEN_mad_susy_gg_t1t1_log.txt | 22 +- .../SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../SubProcesses/P1_gg_t1t1x/CPPProcess.cc | 267 +- .../SubProcesses/P1_gg_t1t1x/CPPProcess.h | 6 + .../susy_gg_t1t1.mad/SubProcesses/cudacpp.mk | 4 +- .../CODEGEN_cudacpp_susy_gg_t1t1_log.txt | 28 +- .../SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../CPPProcess.cc | 257 +- .../P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h | 6 + .../susy_gg_t1t1.sa/SubProcesses/cudacpp.mk | 4 +- .../CODEGEN_mad_susy_gg_tt_log.txt | 24 +- .../SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 263 +- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 6 + .../susy_gg_tt.mad/SubProcesses/cudacpp.mk | 4 +- .../CODEGEN_cudacpp_susy_gg_tt_log.txt | 31 +- .../SubProcesses/GpuAbstraction.h | 6 + .../SubProcesses/MatrixElementKernels.cc | 10 +- .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc | 257 +- .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h | 6 + .../susy_gg_tt.sa/SubProcesses/cudacpp.mk | 4 +- 196 files changed, 10534 insertions(+), 7262 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 9c409d4508..dccdf2e736 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -605,12 +605,13 @@ namespace mg5amcCpu totAllDenominators[ievt] += hAllDenominators[ievt]; fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; - for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag) + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) { firstNumerator[idiag] += hAllNumerators[idiag]; } } - if (mulChannelWeight) { + if( mulChannelWeight ) + { unsigned int channelId = allChannelIds[ievt]; allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } @@ -674,7 +675,7 @@ namespace mg5amcCpu // Event-by-event random choice of color #402 // Event-by-event random choice of channel - if( allrnddiagram != nullptr ) + if( allrnddiagram != nullptr ) { fptype numerator_sum = 0., normalization = 0.; for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index a65e806d08..aa7ad1165e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -189,7 +189,7 @@ const int vecsize = neppV; #endif unsigned int channelIdVec[vecsize]; - if (allChannelIds != nullptr) + if( allChannelIds != nullptr ) { for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { @@ -199,7 +199,8 @@ } // Event-by-event random choice of channel - if ( allrnddiagram != nullptr ) { + if( allrnddiagram != nullptr ) + { for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; @@ -259,7 +260,7 @@ else targetamp[icolC] = targetamp[icolC - 1]; if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += - jamp2_sv[icolC + ncolor * (ieppV / neppV)][ieppV %% neppV]; + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV %% neppV]; } const int ievt = ievt00 + ieppV; //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc index 348597b9bb..2b52267519 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc @@ -13,7 +13,7 @@ using namespace mg5amcGpu; using namespace mg5amcCpu; #endif -namespace +namespace { void* initialize_impl( @@ -25,18 +25,18 @@ namespace #endif fptype* numerators, fptype* denominators, - std::size_t count ) + std::size_t count ) { bool is_good_hel[CPPProcess::ncomb]; sigmaKin_getGoodHel( - momenta, couplings, matrix_elements, numerators, denominators, + momenta, couplings, matrix_elements, numerators, denominators, #ifdef 
MGONGPUCPP_GPUIMPL - color_jamps, + color_jamps, #endif - is_good_hel, - count ); - sigmaKin_setGoodHel(is_good_hel); - return nullptr; + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; } void initialize( @@ -53,31 +53,32 @@ namespace // static local initialization is called exactly once in a thread-safe way static void* dummy = initialize_impl( momenta, couplings, matrix_elements, #ifdef MGONGPUCPP_GPUIMPL - color_jamps, + color_jamps, #endif - numerators, denominators, count - ); -} + numerators, + denominators, + count ); + } #ifdef MGONGPUCPP_GPUIMPL __device__ #endif void - transpose_momenta( - const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) { - std::size_t page_size = MemoryAccessMomentaBase::neppM; - std::size_t i_page = i_event / page_size; - std::size_t i_vector = i_event % page_size; - - for (std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part) { - for(std::size_t i_mom = 0; i_mom < 4; ++i_mom) { - momenta_out[i_page * CPPProcess::npar * 4 * page_size + - i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( - CPPProcess::npar * i_mom + i_part) + i_event]; - } + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } } -} + } #ifdef MGONGPUCPP_GPUIMPL @@ -97,13 +98,13 @@ namespace std::size_t offset ) { std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; - if ( i_event >= count ) return; + if( i_event >= count ) return; transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; - g_s[i_event] = alpha_s_in ? sqrt(4 * M_PI * alpha_s_in[i_event + offset]) : 1.2177157847767195; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; } __global__ void copy_outputs( @@ -123,25 +124,25 @@ std::size_t offset ) { std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; - if ( i_event >= count ) return; + if( i_event >= count ) return;
- if ( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; - if ( amp2_out) + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) { double denominator = denominators[i_event]; - for (std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag) + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) { amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; } } - if ( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; - if ( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; - if ( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; -} + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } #endif // MGONGPUCPP_GPUIMPL
- struct InterfaceInstance + struct InterfaceInstance { bool initialized = false; #ifdef MGONGPUCPP_GPUIMPL @@ -153,14 +154,13 @@ extern "C" { - - UmamiStatus umami_get_meta(UmamiMetaKey meta_key, void* result) + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) { - switch (meta_key) + switch( meta_key ) { - case UMAMI_META_DEVICE: + case UMAMI_META_DEVICE: { - UmamiDevice& device = *static_cast<UmamiDevice*>(result); + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); #ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ device = UMAMI_DEVICE_CUDA; @@ -171,7 +171,7 @@ device = UMAMI_DEVICE_CPU; #endif break; - } + } case UMAMI_META_PARTICLE_COUNT: *static_cast<int*>( result ) = CPPProcess::npar; break; @@ -187,44 +187,41 @@ return UMAMI_ERROR_UNSUPPORTED_META; } return UMAMI_SUCCESS; -} - + }
- UmamiStatus umami_initialize(UmamiHandle* handle, char const* param_card_path) + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) { CPPProcess process; process.initProc( param_card_path ); auto instance = new InterfaceInstance(); *handle = instance; #ifdef MGONGPUCPP_GPUIMPL - for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) { - gpuStreamCreate(&instance->hel_streams[ihel]); + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); } #endif return UMAMI_SUCCESS; }
- UmamiStatus umami_set_parameter( UmamiHandle handle, char const* name, double parameter_real, - double parameter_imag ) + double parameter_imag ) { return UMAMI_ERROR_NOT_IMPLEMENTED; }
- UmamiStatus umami_get_parameter( UmamiHandle handle, char const* name, double* parameter_real, - double* parameter_imag) + double* parameter_imag ) { return UMAMI_ERROR_NOT_IMPLEMENTED; }
- UmamiStatus umami_matrix_element( UmamiHandle handle, size_t count, @@ -236,7 +233,7 @@ size_t output_count, UmamiOutputKey const* output_keys, void* const* outputs ) -{ + { const double* momenta_in = nullptr; const double* alpha_s_in = nullptr; const int* flavor_in = nullptr; // TODO: unused @@ -245,39 +242,39 @@ const double* random_diagram_in = nullptr; const int* diagram_in = nullptr; // TODO: unused
- for (std::size_t i = 0; i < input_count; ++i) + for( std::size_t i = 0; i <
input_count; ++i ) { const void* input = inputs[i]; - switch (input_keys[i]) + switch( input_keys[i] ) { case UMAMI_IN_MOMENTA: - momenta_in = static_cast<const double*>(input); + momenta_in = static_cast<const double*>( input ); break; case UMAMI_IN_ALPHA_S: - alpha_s_in = static_cast<const double*>(input); + alpha_s_in = static_cast<const double*>( input ); break; case UMAMI_IN_FLAVOR_INDEX: - flavor_in = static_cast<const int*>(input); + flavor_in = static_cast<const int*>( input ); break; case UMAMI_IN_RANDOM_COLOR: - random_color_in = static_cast<const double*>(input); + random_color_in = static_cast<const double*>( input ); break; case UMAMI_IN_RANDOM_HELICITY: - random_helicity_in = static_cast<const double*>(input); + random_helicity_in = static_cast<const double*>( input ); break; case UMAMI_IN_RANDOM_DIAGRAM: - random_diagram_in = static_cast<const double*>(input); + random_diagram_in = static_cast<const double*>( input ); break; case UMAMI_IN_HELICITY_INDEX: return UMAMI_ERROR_UNSUPPORTED_INPUT; case UMAMI_IN_DIAGRAM_INDEX: - diagram_in = static_cast<const int*>(input); + diagram_in = static_cast<const int*>( input ); break; default: return UMAMI_ERROR_UNSUPPORTED_INPUT; } } - if ( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
#ifdef MGONGPUCPP_GPUIMPL gpuStream_t gpu_stream = nullptr; @@ -290,26 +287,26 @@ for( std::size_t i = 0; i < output_count; ++i ) { void* output = outputs[i]; - switch (output_keys[i]) + switch( output_keys[i] ) { case UMAMI_OUT_MATRIX_ELEMENT: - m2_out = static_cast<double*>(output); + m2_out = static_cast<double*>( output ); break; case UMAMI_OUT_DIAGRAM_AMP2: - amp2_out = static_cast<double*>(output); + amp2_out = static_cast<double*>( output ); break; case UMAMI_OUT_COLOR_INDEX: - color_out = static_cast<int*>(output); + color_out = static_cast<int*>( output ); break; case UMAMI_OUT_HELICITY_INDEX: - helicity_out = static_cast<int*>(output); + helicity_out = static_cast<int*>( output ); break; case UMAMI_OUT_DIAGRAM_INDEX: - diagram_out = static_cast<int*>(output); + diagram_out = static_cast<int*>( output ); break; #ifdef MGONGPUCPP_GPUIMPL case UMAMI_OUT_GPU_STREAM: - gpu_stream = static_cast<gpuStream_t>(output); + gpu_stream = static_cast<gpuStream_t>( output ); break; #endif default: @@ -328,21 +325,21 @@ unsigned int* diagram_index; std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
- gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof(fptype), gpu_stream ); - gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof(fptype), gpu_stream ); - gpuMallocAsync( &g_s, rounded_count * sizeof(fptype), gpu_stream ); - gpuMallocAsync( &helicity_random, rounded_count * sizeof(fptype), gpu_stream ); - gpuMallocAsync( &color_random, rounded_count * sizeof(fptype), gpu_stream ); - gpuMallocAsync( &diagram_random, rounded_count * sizeof(fptype), gpu_stream ); - gpuMallocAsync( &matrix_elements, rounded_count * sizeof(fptype), gpu_stream ); - gpuMallocAsync( &diagram_index, rounded_count * sizeof(unsigned int), gpu_stream ); - gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof(fptype), gpu_stream ); - gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof(fptype), gpu_stream ); - gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof(fptype), gpu_stream ); - gpuMallocAsync( &helicity_index, rounded_count * sizeof(int), gpu_stream ); - gpuMallocAsync( &color_index, rounded_count * sizeof(int), gpu_stream ); - gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof(fptype), gpu_stream ); - gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof(fptype),
gpu_stream ); + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( momenta_in, @@ -357,8 +354,7 @@ g_s, count, stride, - offset - ); + offset ); computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); checkGpu( gpuPeekAtLastError() ); // TODO: make things fully async (requires using events instead of synchronize in @@ -366,11 +362,11 @@ gpuStreamSynchronize( gpu_stream );
InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); - if ( !instance->initialized ) + if( !instance->initialized ) { initialize( momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); - instance->initialized = true; + instance->initialized = true; }
sigmaKin( @@ -411,7 +407,7 @@ count, stride, offset ); - checkGpu(gpuPeekAtLastError()); + checkGpu( gpuPeekAtLastError() );
gpuFreeAsync( momenta, gpu_stream ); gpuFreeAsync( couplings, gpu_stream ); @@ -428,15 +424,13 @@ gpuFreeAsync( color_index, gpu_stream ); gpuFreeAsync( ghel_matrix_elements, gpu_stream ); gpuFreeAsync( ghel_jamps, gpu_stream ); -#else // MGONGPUCPP_GPUIMPL +#else // MGONGPUCPP_GPUIMPL // need to round to double page size for some reason std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); - HostBufferBase<fptype> couplings( - rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 - ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); HostBufferBase<fptype> g_s( rounded_count ); HostBufferBase<fptype> helicity_random( rounded_count ); HostBufferBase<fptype> color_random( rounded_count ); @@ -447,18 +441,18 @@ HostBufferBase<fptype> denominators( rounded_count ); HostBufferBase<int> helicity_index( rounded_count ); HostBufferBase<int> color_index( rounded_count ); - for ( std::size_t i_event = 0; i_event < count; ++i_event) + for( std::size_t i_event = 0; i_event < count; ++i_event ) { - transpose_momenta(&momenta_in[offset], momenta.data(), i_event, stride); + transpose_momenta( &momenta_in[offset], momenta.data(), i_event,
stride ); helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; - color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; - diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; - g_s[i_event] = alpha_s_in ? sqrt(4 * M_PI * alpha_s_in[i_event + offset]) : 1.2177157847767195; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; } computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); - if (!instance->initialized) + if( !instance->initialized ) { initialize( momenta.data(), @@ -466,8 +460,8 @@ matrix_elements.data(), numerators.data(), denominators.data(), - rounded_count); - instance->initialized = true; + rounded_count ); + instance->initialized = true; }
sigmaKin( @@ -487,52 +481,50 @@ rounded_count );
std::size_t page_size = MemoryAccessMomentaBase::neppM; - for (std::size_t i_event = 0; i_event < count; ++i_event) + for( std::size_t i_event = 0; i_event < count; ++i_event ) { std::size_t i_page = i_event / page_size; std::size_t i_vector = i_event % page_size;
double denominator = denominators[i_event]; - if (m2_out != nullptr) + if( m2_out != nullptr ) { m2_out[i_event + offset] = matrix_elements[i_event]; } - if (amp2_out != nullptr) + if( amp2_out != nullptr ) { - for (std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag) + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) { amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; } } - if (diagram_out != nullptr) + if( diagram_out != nullptr ) { diagram_out[i_event + offset] = diagram_index[i_event] - 1; } - if (color_out != nullptr) + if( color_out != nullptr ) { color_out[i_event + offset] = color_index[i_event] - 1; } - if (helicity_out != nullptr) + if( helicity_out != nullptr ) { helicity_out[i_event + offset] = helicity_index[i_event] - 1; } } #endif // MGONGPUCPP_GPUIMPL return UMAMI_SUCCESS; -} - + }
-UmamiStatus umami_free(UmamiHandle handle) -{ - InterfaceInstance* instance = static_cast<InterfaceInstance*>(handle); -#ifdef MGONGPUCPP_GPUIMPL - for (int ihel = 0; ihel < CPPProcess::ncomb; ihel++) + UmamiStatus umami_free( UmamiHandle handle ) { - if (instance->hel_streams[ihel]) gpuStreamDestroy(instance->hel_streams[ihel]); - } + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } #endif - delete instance; - return UMAMI_SUCCESS; -} - + delete instance; + return UMAMI_SUCCESS; + } } diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h index 8e86854c45..39ac6fe385 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h @@ -98,7 +98,7 @@ extern "C" * @return * UMAMI_SUCCESS on success, error code otherwise */ - UmamiStatus umami_get_meta(UmamiMetaKey
meta_key, void* result); + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); /** * Creates an instance of the matrix element. Each instance is independent, so thread @@ -112,7 +112,7 @@ extern "C" * @return * UMAMI_SUCCESS on success, error code otherwise */ - UmamiStatus umami_initialize(UmamiHandle* handle, char const* param_card_path); + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); /** * Sets the value of a model parameter @@ -132,8 +132,7 @@ extern "C" UmamiHandle handle, char const* name, double parameter_real, - double parameter_imag -); + double parameter_imag ); /** * Retrieves the value of a model parameter @@ -204,7 +203,7 @@ extern "C" * @param handle * handle of a matrix element instance */ - UmamiStatus umami_free(UmamiHandle handle); + UmamiStatus umami_free( UmamiHandle handle ); #ifdef __cplusplus } diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 7d83fbc2f6..cac2fc9257 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -58,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0018742084503173828  +DEBUG: model prefixing takes 0.0017848014831542969  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,17 +150,17 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.006 s +1 processes with 2 diagrams generated in 0.007 s Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  @@ -179,19 +179,19 @@ INFO: Finding symmetric diagrams for subprocess group epem_mupmum DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1564]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1588]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1589]  -Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 1.223 s +Generated helas calls for 1 subprocesses (2 diagrams) in 0.006 s 
+Wrote files for 8 helas calls in 1.876 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.120 s +ALOHA: aloha creates 3 routines in 0.129 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.120 s +ALOHA: aloha creates 7 routines in 0.131 s FFV1 FFV1 FFV2 @@ -215,17 +215,17 @@ INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  +DEBUG: result.returncode =  0 [output.py at line 275]  Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m6.564s -user 0m1.276s -sys 0m0.649s -Code generation completed in 7 seconds +real 0m9.417s +user 0m1.278s +sys 0m0.670s +Code generation completed in 10 seconds /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ @@ -254,6 +254,8 @@ INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEG Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards run +/shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/madgraph/various/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ quit INFO: launch in debug mode diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-      sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+      sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
       assert( useChannelIds == false );
       sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
index 7f07184332..a536b3d076 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
@@ -303,7 +303,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
                    fptype* allJamps,                  // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   bool storeChannelWeights,          // input: if true, accumulate per-diagram numerators and denominators for this helicity
                    fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype* allDenominators,           // input/output: multichannel denominators[nevt], add helicity ihel
                    fptype* colAllJamp2s,              // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -312,7 +312,7 @@ namespace mg5amcCpu
 #else
                    cxtype_sv* allJamp_sv,    // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                   const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+                   bool storeChannelWeights, // input: if true, accumulate per-diagram numerators and denominators for this helicity
                    fptype* allNumerators,    // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype* allDenominators,  // input/output: multichannel denominators[nevt], add helicity ihel
                    fptype_sv* jamp2_sv,      // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -408,7 +408,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -430,10 +431,6 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
     fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv&
denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -455,7 +452,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -471,7 +468,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV2_4_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -789,9 +786,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -867,8 +863,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
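// A minimal standalone sketch (not part of the generated code) of the per-diagram weight
// bookkeeping used above: on the CUDA path, the single scalar numerator per event is replaced
// by a contiguous slice of processConfig::ndiagrams weights per event, addressed as
// numerators[ievt * ndiagrams + idiag] (the SIMD C++ path instead interleaves by neppV).
// 'addDiagramWeight' and its parameters are illustrative names, not part of the plugin API.
inline void addDiagramWeight( double* numerators, int ndiagrams, int ievt, int idiag, double amp2 )
{
  // event-major layout: one contiguous slice of ndiagrams weights per event;
  // amp2 is |amp|^2 of a single diagram for this event and helicity
  numerators[ievt * ndiagrams + idiag] += amp2;
}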
#endif /* clang-format on */
@@ -930,25 +925,36 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights,          // if true, compute final multichannel weights
+    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
     const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        totAllDenominators[ievt] += ghelAllDenominators[ighel * nevt + ievt]; // sum denominators over helicities (one scalar per event per helicity)
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -993,16 +998,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim ==
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1070,6 +1103,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1079,6 +1113,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1090,8 +1126,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1134,7 +1172,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1153,7 +1191,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1165,6 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1173,9 +1214,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1186,9 +1228,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1226,9 +1274,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1243,7 +1288,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1291,82 +1337,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1391,13 +1452,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1405,7 +1460,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index c9d280d0f6..3ca62dbc6e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o 
$(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 0312305458..6cd2239516 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -58,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0028181076049804688  +DEBUG: model prefixing takes 0.001810312271118164  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,28 +154,28 @@ INFO: Process has 2 diagrams Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 
'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. -Generated helas calls for 1 subprocesses (2 diagrams) in 0.005 s +Generated helas calls for 1 subprocesses (2 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.116 s +ALOHA: aloha creates 4 routines in 0.122 s FFV1 FFV1 FFV2 @@ -194,7 +194,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
quit -real 0m1.506s -user 0m0.432s -sys 0m0.135s -Code generation completed in 2 seconds +real 0m2.084s +user 0m0.455s +sys 0m0.171s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-      sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+      sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
       assert( useChannelIds == false );
       sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc
index e030cc38c8..ec11e4c04b 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc
@@ -303,7 +303,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
                    fptype* allJamps,                  // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   bool storeChannelWeights,          // input: if true, accumulate per-diagram numerators and denominators for this helicity
                    fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype* allDenominators,           // input/output: multichannel denominators[nevt], add helicity ihel
                    fptype* colAllJamp2s,              // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -312,7 +312,7 @@ namespace mg5amcCpu
 #else
                    cxtype_sv* allJamp_sv,    // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                   const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+                   bool storeChannelWeights, // input: if true, accumulate per-diagram numerators and denominators for this helicity
                    fptype* allNumerators,    // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype* allDenominators,  // input/output: multichannel denominators[nevt], add helicity ihel
                    fptype_sv* jamp2_sv,      // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -408,7 +408,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -430,10 +431,6 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
     fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP(
numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
@@ -781,9 +778,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -859,8 +855,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -922,25 +917,36 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights,          // if true, compute final multichannel weights
+    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
     const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        totAllDenominators[ievt] += ghelAllDenominators[ighel * nevt + ievt]; // sum denominators over helicities (one scalar per event per helicity)
+        for( int idiag =
0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -985,16 +990,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1062,6 +1095,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1071,6 +1105,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: 
allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1082,8 +1118,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1126,7 +1164,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1145,7 +1183,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1157,6 +1197,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1165,9 +1206,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], 
ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1178,9 +1220,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1218,9 +1266,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1235,7 +1280,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1283,82 +1329,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -1383,13 +1444,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -1397,7 +1452,7 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
         const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
         fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
index c9d280d0f6..3ca62dbc6e 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
@@ -163,6 +163,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -171,6 +172,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s 
super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt 
b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index f31e287e32..b485abf77b 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -58,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0018684864044189453  +DEBUG: model prefixing takes 0.0018193721771240234  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,17 +151,17 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.015 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  @@ -180,16 +180,16 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1589]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s -Wrote files for 10 helas calls in 0.925 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.009 s +Wrote files for 10 helas calls in 1.922 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.098 s +ALOHA: aloha creates 2 routines in 0.102 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.082 s +ALOHA: aloha creates 4 routines in 0.069 s VVV1 FFV1 FFV1 @@ -209,17 +209,17 @@ INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  +DEBUG: result.returncode =  0 [output.py at line 275]  Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. 
Type "launch" to generate events from this process, or see /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m5.871s -user 0m1.183s -sys 0m0.587s -Code generation completed in 6 seconds +real 0m8.946s +user 0m1.272s +sys 0m0.626s +Code generation completed in 9 seconds /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 65712c3058..4204e595d8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = 
DEN_ACCESS::kernelAccess( denominators ); @@ -455,7 +452,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -472,7 +469,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -488,7 +485,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -805,9 +802,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -883,8 +879,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
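// ------------------------------------------------------------------------------------
// [Editor's aside - illustrative sketch only, NOT part of the patch under review]
// The numerator buffers passed to calculate_jamps above now hold
// processConfig::ndiagrams values per event instead of one. In the C++/SIMD path the
// layout is page-major: for each page of neppV events, the ndiagrams per-diagram
// weights are stored as [idiag][ieppV]. A hypothetical helper encoding this indexing
// (the name numeratorAt and its signature are assumptions, not code from this patch):
//
//   inline fptype& numeratorAt( fptype* allNumerators, int ievt, int idiag )
//   {
//     const int ipagV = ievt / neppV; // SIMD page containing this event
//     const int ieppV = ievt % neppV; // slot of this event within its page
//     return allNumerators[( ipagV * processConfig::ndiagrams + idiag ) * neppV + ieppV];
//   }
//
// This reproduces the expression 'ievt / neppV * neppV * processConfig::ndiagrams +
// ichan * neppV + ieppV % neppV' used by the diagram-sampling loop later in this file;
// on the GPU side, by contrast, each event owns a contiguous slice of ndiagrams values
// ('&allNumerators[ievt * processConfig::ndiagrams]', see calculate_jamps above).
// ------------------------------------------------------------------------------------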
#endif /* clang-format on */
@@ -946,25 +941,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -1009,16 +1014,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1086,6 +1119,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1095,6 +1129,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1106,8 +1142,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1150,7 +1188,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1169,7 +1207,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1181,6 +1221,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1189,9 +1230,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1202,9 +1244,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1242,9 +1290,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1259,7 +1304,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1307,82 +1353,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -1407,13 +1468,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
    {
       const int ievt0 = ipagV * neppV;
@@ -1421,7 +1476,7 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
      {
         const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
         fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
diff --git 
a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 26652fc739..5fdf36bb26 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o 
$(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 3410a9d9e8..d5886a1099 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -58,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0018236637115478516  +DEBUG: model prefixing takes 0.0018737316131591797  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,30 +151,30 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.007 s +1 processes with 3 diagrams generated in 0.011 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 
'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.060 s +ALOHA: aloha creates 2 routines in 0.065 s VVV1 FFV1 FFV1 @@ -189,7 +189,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
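For orientation, a minimal sketch of a C++ host-side call matching the extended sigmaKin signature above (buffer names are illustrative; passing nullptr for the two new multichannel slots and true for mulChannelWeight reproduces the pre-existing behaviour, as the MatrixElementKernels call sites below do):

    sigmaKin( momenta, couplings, rndhel, rndcol,
              channelIds,    // input: channelIds[nevt], or nullptr to disable single-diagram enhancement
              rnddiagram,    // new input: random numbers[nevt] for channel sampling (nullptr to skip it)
              MEs, selhel, selcol,
              numerators,    // now sized nevt * CPPProcess::ndiagrams, one slot per diagram
              denominators,
              diagramIdsOut, // new output: sampled channelIds[nevt] (nullptr if unused)
              true,          // mulChannelWeight: multiply the channel weight into the ME output
              nevt );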
quit -real 0m1.286s -user 0m0.361s -sys 0m0.100s +real 0m1.735s +user 0m0.348s +sys 0m0.112s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... 
Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 59e7d2f86c..bbc2c6c17c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + 
threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -793,9 +790,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -871,8 +867,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
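// Illustration (a sketch, not generated code): the per-diagram bookkeeping above implies one
// fixed numerator super-buffer layout on the GPU side, helicity-major, then event, then diagram.
// A hypothetical indexing helper consistent with the offsets used in this patch:
inline size_t ghelNumeratorIndex( int ighel, int ievt, int idiag, int nevt ) // hypothetical name
{
  // same offset as ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams + idiag
  return ( static_cast<size_t>( ighel ) * nevt + ievt ) * processConfig::ndiagrams + idiag;
}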
 #endif /* clang-format on */
@@ -934,25 +929,37 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights,          // if true, compute final multichannel weights
+    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
     const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
-        fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
-        totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+        totAllDenominators[ievt] += hAllDenominators[ievt]; // keep the denominator running sum over helicities (it is divided out below)
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -997,16 +1002,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim ==
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1074,6 +1107,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1083,6 +1117,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1094,8 +1130,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1138,7 +1176,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1157,7 +1195,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1169,6 +1209,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1177,9 +1218,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1190,9 +1232,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1230,9 +1278,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1247,7 +1292,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1295,82 +1341,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1395,13 +1456,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1409,7 +1464,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 26652fc739..5fdf36bb26 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o 
$(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index eb37f81cd3..c6e2bc2275 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -58,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0017805099487304688  +DEBUG: model prefixing takes 0.00186920166015625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.012 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -159,17 +159,17 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.017 s +1 processes with 16 diagrams generated in 0.025 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  @@ -201,22 +201,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1589]  -Generated helas calls for 2 subprocesses (19 diagrams) in 0.021 s -Wrote files for 46 helas calls in 2.631 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.023 s +Wrote files for 46 helas calls in 4.525 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.182 s +ALOHA: aloha creates 5 routines in 0.191 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.137 s +ALOHA: aloha creates 10 routines in 0.166 s VVV1 VVV1 FFV1 @@ -241,17 +241,17 @@ INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  +DEBUG: result.returncode =  0 [output.py at line 275]  Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. 
quit -real 0m8.102s -user 0m1.532s -sys 0m0.740s -Code generation completed in 9 seconds +real 0m12.237s +user 0m1.681s +sys 0m0.791s +Code generation completed in 12 seconds /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 65712c3058..4204e595d8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = 
DEN_ACCESS::kernelAccess( denominators ); @@ -455,7 +452,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -472,7 +469,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -488,7 +485,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -805,9 +802,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -883,8 +879,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
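// Illustration (a sketch, not generated code): the storeChannelWeights guards above accumulate
// one |amp|^2 slot per diagram; the new diagram sampling then draws a channel from these sums by
// inverse CDF, as in the select_col_and_diag kernel and the C++ sigmaKin loop. A standalone
// single-event version of that choice ('sampleChannel' is a hypothetical name):
inline unsigned int sampleChannel( const fptype* numerators, const fptype rnd ) // numerators[ndiagrams] for one event
{
  fptype normalization = 0;
  for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
    if( mgOnGpu::channel2iconfig[ichan] != -1 ) normalization += numerators[ichan]; // skip channels with no SDE iconfig
  fptype numerator_sum = 0;
  for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
  {
    if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
    numerator_sum += numerators[ichan];
    if( rnd < numerator_sum / normalization ) return ichan + 1; // channelIds are 1-based
  }
  return mgOnGpu::nchannels; // fallback for rnd ~ 1 under floating-point rounding
}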
 #endif /* clang-format on */
@@ -946,25 +941,37 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights,          // if true, compute final multichannel weights
+    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
     const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
-        fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
-        totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+        totAllDenominators[ievt] += hAllDenominators[ievt]; // keep the denominator running sum over helicities (it is divided out below)
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -1009,16 +1014,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim ==
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1086,6 +1119,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1095,6 +1129,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1106,8 +1142,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1150,7 +1188,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1169,7 +1207,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1181,6 +1221,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1189,9 +1230,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1202,9 +1244,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1242,9 +1290,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1259,7 +1304,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1307,82 +1353,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1407,13 +1468,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1421,7 +1476,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
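Both the select_col_and_diag kernel and the SIMD loop above pick a channel by inverting the cumulative distribution built from the per-diagram numerators, skipping channels that have no SDE config (channel2iconfig == -1). A self-contained sketch of that sampling logic, with hypothetical data (the real arrays live in mgOnGpu):

  #include <cassert>
  #include <vector>
  // Returns a 1-based channel id sampled in proportion to the per-diagram numerators,
  // skipping channels without an SDE config (marked -1), given a uniform r in [0,1).
  unsigned int sampleChannel( const std::vector<double>& numerators,
                              const std::vector<int>& channel2iconfig,
                              double r )
  {
    double normalization = 0.;
    for( size_t ichan = 0; ichan < numerators.size(); ichan++ )
      if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
    double cumulative = 0.;
    for( size_t ichan = 0; ichan < numerators.size(); ichan++ )
    {
      if( channel2iconfig[ichan] == -1 ) continue;
      cumulative += numerators[ichan];
      if( r < cumulative / normalization ) return ichan + 1;
    }
    return numerators.size(); // fallback for r ~ 1 (rounding), as in the kernel above
  }
  int main()
  {
    const std::vector<double> num = { 1., 3., 6. };
    const std::vector<int> c2i = { 1, -1, 2 }; // channel 2 has no SDE config
    assert( sampleChannel( num, c2i, 0.10 ) == 1 ); // 1/7 of the retained weight
    assert( sampleChannel( num, c2i, 0.50 ) == 3 ); // channel 2 is skipped entirely
    return 0;
  }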
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 26652fc739..5fdf36bb26 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index fa3d841089..e32a6ef9b4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, //
input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -458,7 +455,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -477,7 +474,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -494,7 +491,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -512,7 +509,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -528,7 +525,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -545,7 +542,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[5] 
+= cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -562,7 +559,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -578,7 +575,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -595,7 +592,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -611,7 +608,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -628,7 +625,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -645,7 +642,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -664,7 +661,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[12] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -680,7 +677,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[13] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -696,7 +693,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[14] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1058,9 +1055,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for 
color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1136,8 +1132,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1199,25 +1194,37 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute the final multichannel weights + bool mulChannelWeight, // if true, multiply the matrix element by the channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + totAllDenominators[ievt] += hAllDenominators[ievt]; // NB: denominators remain one per event and must still be summed over helicities + } + if( mulChannelWeight && allChannelIds != nullptr ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1262,16 +1267,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE
enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1339,6 +1372,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1348,6 +1382,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1359,8 +1395,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, 
// tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1403,7 +1441,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1422,7 +1460,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1434,6 +1474,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1442,9 +1483,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1455,9 +1497,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1495,9 +1543,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1512,7 +1557,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1560,82 +1606,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1660,13 +1721,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1674,7 +1729,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
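In the C++ SIMD path of this file (as in its P1 counterpart above), the per-diagram numerators of one event page are addressed as ipagV * ( neppV * ndiagrams ) + idiag * neppV + lane; the expression ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV used in the sampling loop is that same arithmetic. A small sketch of the address computation, with hypothetical sizes:

  // Sketch (not part of the patch) of the assumed AOSOA addressing: pages of neppV
  // events, with the neppV lanes of each diagram stored contiguously inside a page.
  #include <cassert>
  int main()
  {
    const int neppV = 4, ndiagrams = 3; // hypothetical sizes
    auto index = []( int ievt, int idiag, int neppV_, int ndiag_ ) {
      return ievt / neppV_ * neppV_ * ndiag_ + idiag * neppV_ + ievt % neppV_;
    };
    // Event 5 lives in page 1 (events 4-7), lane 1
    assert( index( 5, 0, neppV, ndiagrams ) == 1 * neppV * ndiagrams + 0 * neppV + 1 );
    assert( index( 5, 2, neppV, ndiagrams ) == 1 * neppV * ndiagrams + 2 * neppV + 1 );
    return 0;
  }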
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index a0fbcbb773..a49500a023 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o
$(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 9f1c215d46..d7b2672731 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -58,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0018467903137207031  +DEBUG: model prefixing takes 0.0017828941345214844  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,17 +151,17 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.020 s +1 processes with 16 diagrams generated in 0.024 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  @@ -180,22 +180,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxg DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1564]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1588]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1589]  -Generated helas calls for 1 subprocesses (16 
diagrams) in 0.015 s -Wrote files for 36 helas calls in 1.413 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.016 s +Wrote files for 36 helas calls in 2.297 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.178 s +ALOHA: aloha creates 5 routines in 0.185 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.134 s +ALOHA: aloha creates 10 routines in 0.172 s VVV1 VVV1 FFV1 @@ -220,17 +220,17 @@ INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  +DEBUG: result.returncode =  0 [output.py at line 275]  Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m6.988s -user 0m1.465s -sys 0m0.615s -Code generation completed in 8 seconds +real 0m9.939s +user 0m1.538s +sys 0m0.629s +Code generation completed in 10 seconds /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- 
a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index c2f3ee7141..b8f69df605 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, // input: if true, accumulate multichannel numerators/denominators for helicity ihel fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, // input: if true, accumulate multichannel numerators/denominators for helicity ihel fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = 
DEN_ACCESS::kernelAccess( denominators ); @@ -458,7 +455,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -477,7 +474,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -494,7 +491,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -512,7 +509,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -528,7 +525,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -545,7 +542,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -562,7 +559,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -578,7 +575,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -595,7 +592,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -611,7 +608,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -628,7 +625,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -645,7 +642,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 VVV1_0( w_fp[5], w_fp[1], 
w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -664,7 +661,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[12] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -680,7 +677,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[13] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -696,7 +693,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[14] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1058,9 +1055,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1136,8 +1132,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
#endif /* clang-format on */ @@ -1199,25 +1194,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, sum the per-helicity multichannel numerators/denominators into the final weights + bool mulChannelWeight, // if true, multiply the matrix element by the channel weight (requires allChannelIds != nullptr) #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1262,16 +1267,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators[nevt*ndiagrams] (summed over helicities) + const fptype* allDenominators, // input: all denominators[nevt] (summed over helicities) + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1339,6 +1372,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1348,6 +1382,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1359,8 +1395,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1403,7 +1441,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1422,7 +1460,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1434,6 +1474,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1442,9 +1483,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1455,9 +1497,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1495,9 +1543,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1512,7 +1557,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1560,82 +1606,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1660,13 +1721,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1674,7 +1729,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 1b49cac30b..1b956214b7 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o 
$(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 8002f4efbd..c635672d98 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -58,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0017540454864501953  +DEBUG: model prefixing takes 0.0044193267822265625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Process has 16 diagrams Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 
'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc @@ -177,7 +177,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.142 s +ALOHA: aloha creates 5 routines in 0.161 s VVV1 VVV1 FFV1 @@ -197,7 +197,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
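// --------------------------------------------------------------------------------------------------------
// [editor's sketch, not part of the patch] The GpuAbstraction.h hunks in this patch add stream-ordered
// allocation wrappers (gpuMallocAsync/gpuFreeAsync/gpuStreamSynchronize) on top of cudaMallocAsync and
// hipMallocAsync. A minimal usage sketch, assuming a CUDA build with GpuAbstraction.h included
// (the buffer size and the dTmp name are illustrative only):
//
//   gpuStream_t stream;
//   gpuStreamCreate( &stream );
//   double* dTmp = nullptr;
//   gpuMallocAsync( (void**)&dTmp, 1024 * sizeof( double ), stream ); // allocation is ordered on 'stream'
//   // ... enqueue kernels that use dTmp on the same stream ...
//   gpuFreeAsync( dTmp, stream );   // stream-ordered free: no device-wide synchronisation required
//   gpuStreamSynchronize( stream ); // block the host only until this stream's work has drained
//   gpuStreamDestroy( stream );
// --------------------------------------------------------------------------------------------------------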
quit -real 0m1.685s -user 0m0.477s -sys 0m0.156s -Code generation completed in 2 seconds +real 0m2.175s +user 0m0.523s +sys 0m0.144s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 98bb331cee..45ea024451 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, // input: if true, accumulate multichannel numerators/denominators for helicity ihel fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, // input: if true, accumulate multichannel numerators/denominators for helicity ihel fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& 
denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -1007,9 +1004,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1085,8 +1081,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1148,25 +1143,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, sum the per-helicity multichannel numerators/denominators into the final weights + bool mulChannelWeight, // if true, multiply the matrix element by the channel weight (requires allChannelIds != nullptr) #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < 
processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1211,16 +1216,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1288,6 +1321,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1297,6 +1331,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1308,8 +1344,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1352,7 +1390,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1371,7 +1409,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1383,6 +1423,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1391,9 +1432,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, 
allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1404,9 +1446,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1444,9 +1492,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1461,7 +1506,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1509,82 +1555,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1609,13 +1670,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1623,7 +1678,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 1b49cac30b..1b956214b7 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor 
individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 
951f0e3b1d..b697a4a0e0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -58,7 +58,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0018601417541503906  +DEBUG: model prefixing takes 0.0018012523651123047  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,17 +151,17 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.066 s +1 processes with 123 diagrams generated in 0.085 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  @@ -180,22 +180,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxgg DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1564]  DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1588]  DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 
55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1589]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.186 s -Wrote files for 222 helas calls in 1.983 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.154 s +Wrote files for 222 helas calls in 3.038 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.144 s +ALOHA: aloha creates 5 routines in 0.194 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.152 s +ALOHA: aloha creates 10 routines in 0.167 s VVV1 VVV1 FFV1 @@ -223,17 +223,17 @@ INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  +DEBUG: result.returncode =  0 [output.py at line 275]  Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. 
quit -real 0m7.349s -user 0m1.941s -sys 0m0.599s -Code generation completed in 8 seconds +real 0m10.227s +user 0m2.050s +sys 0m0.643s +Code generation completed in 10 seconds /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 8c0e33696c..ca0360d110 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = 
DEN_ACCESS::kernelAccess( denominators ); @@ -494,7 +491,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -517,7 +514,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -540,7 +537,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -564,7 +561,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -581,7 +578,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -600,7 +597,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -617,7 +614,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -634,7 +631,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -653,7 +650,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -670,7 +667,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -687,7 +684,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -706,7 +703,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( 
w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[12] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -723,7 +720,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[13] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -742,7 +739,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[14] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -761,7 +758,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[15] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -782,7 +779,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[16] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -798,7 +795,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[17] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -814,7 +811,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[18] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -832,7 +829,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[19] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -851,7 +848,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[20] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -868,7 +865,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[21] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -885,7 +882,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[22] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -904,7 +901,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + 
if( storeChannelWeights ) { numerators_sv[23] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -921,7 +918,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[24] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -938,7 +935,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[25] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -954,7 +951,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[26] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -970,7 +967,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[27] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -986,7 +983,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[28] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1002,7 +999,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[29] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1019,7 +1016,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[30] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1064,7 +1061,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[32] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1080,7 +1077,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 34 FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[33] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1096,7 +1093,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[34] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1113,7 +1110,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[35] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( 
amp_sv[0] ); @@ -1132,7 +1129,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 37 FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[36] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1149,7 +1146,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 38 FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[37] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1166,7 +1163,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 39 VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[38] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1185,7 +1182,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 40 FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[39] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1202,7 +1199,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 41 FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[40] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1219,7 +1216,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 42 FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[41] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1235,7 +1232,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 43 FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[42] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1251,7 +1248,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 44 FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[43] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1267,7 +1264,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 45 FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[44] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1283,7 +1280,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 46 FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[45] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1300,7 +1297,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 47 VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[46] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1342,7 +1339,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 
49 FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[48] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1359,7 +1356,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 50 FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[49] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1378,7 +1375,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 51 FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[50] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1395,7 +1392,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 52 FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[51] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1412,7 +1409,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 53 FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[52] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1431,7 +1428,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 54 FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[53] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1448,7 +1445,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 55 FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[54] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1467,7 +1464,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 56 FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[55] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1486,7 +1483,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 57 VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[56] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1543,7 +1540,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 59 VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[58] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1566,7 +1563,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 60 VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[59] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1589,7 +1586,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 61 FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[60] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1608,7 +1605,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 62 FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[61] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1625,7 +1622,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 63 FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[62] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1644,7 +1641,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 64 FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[63] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1662,7 +1659,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 65 FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[64] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1679,7 +1676,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 66 FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[65] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1698,7 +1695,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 67 FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[66] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1715,7 +1712,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 68 FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[67] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1732,7 +1729,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 69 FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[68] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1751,7 +1748,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 70 FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[69] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1768,7 +1765,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 71 FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[70] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1787,7 +1784,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 72 FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { 
numerators_sv[71] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1806,7 +1803,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 73 VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[72] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1863,7 +1860,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 75 VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[74] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1886,7 +1883,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 76 VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[75] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1909,7 +1906,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 77 FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[76] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1928,7 +1925,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 78 FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[77] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1945,7 +1942,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 79 FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[78] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1964,7 +1961,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 80 FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[79] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1981,7 +1978,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 81 FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[80] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1997,7 +1994,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 82 FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[81] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2013,7 +2010,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 83 FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[82] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2029,7 +2026,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 84 FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[83] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2045,7 
+2042,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 85 FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[84] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2062,7 +2059,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 86 FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[85] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2081,7 +2078,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 87 FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[86] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2097,7 +2094,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 88 FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[87] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2113,7 +2110,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 89 FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[88] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2129,7 +2126,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 90 FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[89] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2145,7 +2142,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 91 FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[90] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2162,7 +2159,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 92 FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[91] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2215,7 +2212,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 94 VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[93] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2238,7 +2235,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 95 VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[94] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2261,7 +2258,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 96 FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[95] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2280,7 +2277,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 97 FFV1_0( w_fp[3], 
@@ -2280,7 +2277,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 97
     FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[96] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -2297,7 +2294,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 98
     FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[97] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -2316,7 +2313,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 99
     FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[98] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -2367,7 +2364,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 101
     VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[100] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -2390,7 +2387,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 102
     VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[101] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -2413,7 +2410,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 103
     FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[102] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -2432,7 +2429,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 104
     FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[103] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -2449,7 +2446,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 105
     FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[104] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -2468,7 +2465,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 106
     FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[105] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -2519,7 +2516,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 108
     VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[107] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -2542,7 +2539,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 109
     VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[108] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -2565,7 +2562,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 110
     FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[109] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -2581,7 +2578,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 111
     FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[110] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -2597,7 +2594,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 112
     FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[111] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -2613,7 +2610,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 113
     FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[112] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -3256,9 +3253,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -3334,8 +3330,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
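In the two helicity-filtering hunks above, the old code disabled the multichannel bookkeeping by passing null `allChannelIds`/`channelId = 0`; the new API states the intent directly with a boolean that is hardwired to `false` during filtering. A sketch of how that flag is derived elsewhere in this patch (hypothetical free function, for illustration only):

inline bool wantChannelWeights( const unsigned int* allChannelIds, const double* allrnddiagram )
{
  // mirrors 'bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;'
  // as computed inside sigmaKin below; during helicity filtering both inputs are absent
  return allChannelIds != nullptr || allrnddiagram != nullptr;
}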
@@ -3397,25 +3392,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom) /* clang-format on */
 {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
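The reworked `normalise_output` above folds an [ighel][ievt][idiag] numerator cube into its helicity-0 slice before applying the single-diagram weight. The same reduction on the host, as a sketch (buffer layout as in this patch; `double` stands in for `fptype`):

void reduceNumerators( double* ghelAllNumerators, int nGoodHel, int nevt, int ndiagrams )
{
  for( int ievt = 0; ievt < nevt; ++ievt )
    for( int ighel = 1; ighel < nGoodHel; ++ighel ) // helicity #0 accumulates the total
    {
      double* hNum = ghelAllNumerators + ( ievt + ighel * nevt ) * ndiagrams;
      double* totNum = ghelAllNumerators + ievt * ndiagrams; // the "helicity #0" record
      for( int idiag = 0; idiag < ndiagrams; ++idiag ) totNum[idiag] += hNum[idiag];
    }
  // the kernel then rescales: allMEs[ievt] *= totNum[channelId - 1] / totDen[ievt];
}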
@@ -3460,16 +3465,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -3537,6 +3570,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
             fptype* allMEs,    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,    // output: helicity selection[nevt]
@@ -3546,6 +3580,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,       // if true, multiply channel weight to ME output
 #endif
             fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -3557,8 +3593,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             int* allselcol,          // output: helicity selection[nevt]
-            fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,   // if true, multiply channel weight to ME output
 #endif
             const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
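`select_col_and_diag` above draws the channel by inverse-CDF sampling over the per-diagram numerators, skipping channels without an SDE config. The same algorithm for a single event, as a plain C++ sketch (hedged: `-1` entries of `channel2iconfig` mark skipped channels, as in `mgOnGpu`):

#include <vector>

// Returns a 1-based channel id sampled proportionally to its numerator.
unsigned int sampleDiagram( const std::vector<double>& numerators, // summed |amp|^2 per channel
                            const std::vector<int>& channel2iconfig,
                            double rnd ) // uniform random number in [0,1)
{
  double normalization = 0.;
  for( size_t ichan = 0; ichan < numerators.size(); ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  double cumulative = 0.;
  for( size_t ichan = 0; ichan < numerators.size(); ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue; // channel has no SDE config
    cumulative += numerators[ichan];
    if( rnd < cumulative / normalization ) return ichan + 1; // 1-based channelId
  }
  return numerators.size(); // fallback, as in the kernel above
}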
@@ -3601,7 +3639,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -3620,7 +3658,9 @@ namespace mg5amcCpu
         fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
         for( int i = 0; i < processConfig::ndiagrams; ++i )
+        {
           numerators_sv[i] = fptype_sv{ 0 };
+        }
         denominators_sv = fptype_sv{ 0 };
 #endif
       }
@@ -3632,6 +3672,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
 
   // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+
   // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
   // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
   // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -3640,9 +3681,10 @@ namespace mg5amcCpu
     const int ihel = cGoodHel[ighel];
     fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
     fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
     gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -3653,9 +3695,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
 
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
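The launches above change the per-helicity numerator stride from `nevt` to `nevt * ndiagrams`, matching the enlarged `gpuMemset`. The addressing spelled out as a helper (a sketch; index names follow the patch):

inline double* numeratorSlot( double* ghelAllNumerators, int ighel, int ievt, int idiag,
                              int nevt, int ndiagrams )
{
  // layout: [ighel][ievt][idiag], i.e. one ndiagrams-sized record per event per good helicity
  return ghelAllNumerators + ighel * nevt * ndiagrams + ievt * ndiagrams + idiag;
}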
@@ -3693,9 +3741,6 @@ namespace mg5amcCpu
     const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
     const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
     // Running sum of partial amplitudes squared for event by event color selection (#402)
     // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -3710,7 +3755,8 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -3758,82 +3804,97 @@ namespace mg5amcCpu
       }
 #endif
     }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-    // Event-by-event random choice of color #402
-    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-    {
-      if( channelId > mgOnGpu::nchannels )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-        assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-      }
-      // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-      // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-      const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-      if( iconfig <= 0 )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-        assert( iconfig > 0 ); // SANITY CHECK #917
-      }
-      else if( iconfig > (int)mgOnGpu::nconfigSDE )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-        assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-      }
-      fptype_sv targetamp[ncolor] = { 0 };
-      // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-      for( int icolC = 0; icolC < ncolor; icolC++ )
-      {
-        if( icolC == 0 )
-          targetamp[icolC] = fptype_sv{ 0 };
-        else
-          targetamp[icolC] = targetamp[icolC - 1];
-        if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-      }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype_sv targetamp2[ncolor] = { 0 };
-      for( int icolC = 0; icolC < ncolor; icolC++ )
+    const int vecsize = 2 * neppV;
+#else
+    const int vecsize = neppV;
+#endif
+    unsigned int channelIdVec[vecsize];
+    if( allChannelIds != nullptr )
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
       {
-        if( icolC == 0 )
-          targetamp2[icolC] = fptype_sv{ 0 };
-        else
-          targetamp2[icolC] = targetamp2[icolC - 1];
-        // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-        if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+        const int ievt = ievt00 + ieppV;
+        channelIdVec[ieppV] = allChannelIds[ievt];
       }
-#endif
-      for( int ieppV = 0; ieppV < neppV; ++ieppV )
+    }
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
       {
         const int ievt = ievt00 + ieppV;
-        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+        fptype numerator_sum = 0., normalization = 0.;
+        for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
         {
-#if defined MGONGPU_CPPSIMD
-          // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-          volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-          const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-          if( okcol )
+          if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+          normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                         ichan * neppV + ieppV % neppV];
+        }
+        channelIdVec[ieppV] = mgOnGpu::nchannels;
+        for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+        {
+          if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+          numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                         ichan * neppV + ieppV % neppV];
+          if( allrnddiagram[ievt] < numerator_sum / normalization )
           {
-            allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-            //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+            channelIdVec[ieppV] = ichan + 1;
             break;
           }
         }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        const int ievt2 = ievt00 + ieppV + neppV;
-        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+        allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+      }
+    }
+
+    // Event-by-event random choice of color #402
+    if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+      {
+        unsigned int channelId = channelIdVec[ieppV];
+        if( channelId > mgOnGpu::nchannels )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+        }
+        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+        if( iconfig <= 0 )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+          assert( iconfig > 0 ); // SANITY CHECK #917
+        }
+        else if( iconfig > (int)mgOnGpu::nconfigSDE )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+        }
+        fptype targetamp[ncolor] = { 0 };
+        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+        for( int icolC = 0; icolC < ncolor; icolC++ )
+        {
+          if( icolC == 0 )
+            targetamp[icolC] = 0;
+          else
+            targetamp[icolC] = targetamp[icolC - 1];
+          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+            jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+        }
+        const int ievt = ievt00 + ieppV;
+        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
         for( int icolC = 0; icolC < ncolor; icolC++ )
         {
-          if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+          if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
           {
-            allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-            //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+            allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+            //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
             break;
           }
         }
-#endif
       }
     }
     else
@@ -3858,13 +3919,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -3872,7 +3927,7 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
      {
        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
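In the C++/SIMD branch above, numerators are stored page-by-page: within the SIMD page containing an event, each diagram's lane values are contiguous. A sketch of the index arithmetic used by the channel-sampling and normalisation code (assuming `neppV` events per SIMD page):

inline int numeratorIndex( int ievt, int ichan, int neppV, int ndiagrams )
{
  const int pageStart = ievt / neppV * neppV * ndiagrams; // first slot of this event's SIMD page
  return pageStart + ichan * neppV + ievt % neppV;        // this event's lane within diagram ichan
}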
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h
index 7b57d7c763..e3c578f5e0 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h
@@ -163,6 +163,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs,    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,    // output: helicity selection[nevt]
@@ -171,6 +172,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,       // if true, multiply channel weight to ME output
 #endif
             fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel, // output: helicity selection[nevt]
@@ -194,6 +198,8 @@ namespace mg5amcCpu
             int* allselcol,          // output: helicity selection[nevt]
             fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
             fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,   // if true, multiply channel weight to ME output
 #endif
             const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
index ca2ea3a480..2cc4b19c4e 100644
--- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
+++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
@@ -58,7 +58,7 @@ generate g g > t t~ g g
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.0019006729125976562
+DEBUG: model prefixing takes 0.001790761947631836
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -151,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED
 INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1
 INFO: Process has 123 diagrams
-1 processes with 123 diagrams generated in 0.086 s
+1 processes with 123 diagrams generated in 0.083 s
 Total: 1 processes with 123 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg
 Output will be done with PLUGIN: CUDACPP_OUTPUT
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]
 INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg
 INFO: Organizing processes into subprocess groups
 INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1
 INFO: Processing color information for process: g g > t t~ g g @1
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]
-DEBUG: type(subproc_group)= [output.py at line 223]
-DEBUG: type(fortran_model)= [output.py at line 224]
-DEBUG: type(me)= me=0 [output.py at line 225]
-DEBUG: "need to link", self.to_link_in_P = need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]
+DEBUG: type(subproc_group)= [output.py at line 225]
+DEBUG: type(fortran_model)= [output.py at line 226]
+DEBUG: type(me)= me=0 [output.py at line 227]
+DEBUG: "need to link", self.to_link_in_P = need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]
 INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg
 FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h
 FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/.
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.192 s
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.152 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in 0.157 s
+ALOHA: aloha creates 5 routines in 0.173 s
 VVV1
 VVV1
 FFV1
@@ -200,7 +200,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/.
 quit
-real 0m2.034s
-user 0m0.812s
-sys 0m0.125s
-Code generation completed in 2 seconds
+real 0m2.439s
+user 0m0.757s
+sys 0m0.131s
+Code generation completed in 3 seconds
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h
index 8a37d1f947..026253f354 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h
@@ -49,6 +49,9 @@
 #define gpuStream_t cudaStream_t
 #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
 #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) )
+#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) )
+#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) )
 
 #define gpuBlasStatus_t cublasStatus_t
 #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
@@ -113,6 +116,9 @@
 #define gpuStream_t hipStream_t
 #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
 #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) )
+#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) )
+#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) )
 
 #define gpuBlasStatus_t hipblasStatus_t
 #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
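The new `gpuMallocAsync`/`gpuFreeAsync`/`gpuStreamSynchronize` macros above give one spelling for the CUDA and HIP stream-ordered allocators. A usage sketch under the CUDA branch of this header (`checkGpu` and the `gpuStream*` macros as defined there):

void withStreamScratch( size_t nbytes )
{
  gpuStream_t stream;
  gpuStreamCreate( &stream );
  void* scratch = nullptr;
  gpuMallocAsync( &scratch, nbytes, stream ); // expands to checkGpu( cudaMallocAsync( ... ) )
  // ... enqueue kernels on 'stream' that use 'scratch' ...
  gpuFreeAsync( scratch, stream );            // free is ordered after the enqueued work
  gpuStreamSynchronize( stream );             // block until the stream has drained
  gpuStreamDestroy( stream );
}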
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc
index 5ede45b123..469edd8d9e 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc
@@ -162,7 +162,7 @@ namespace mg5amcCpu
     , NumberOfEvents( nevt )
     , m_couplings( nevt )
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
+    , m_numerators( nevt * CPPProcess::ndiagrams )
     , m_denominators( nevt )
 #endif
 {
@@ -220,7 +220,7 @@ namespace mg5amcCpu
     computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() );
@@ -356,7 +356,7 @@ namespace mg5amcGpu
     m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering
-    m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) );
+    m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) );
     m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) );
 #endif
     // Decide at runtime whether to use BLAS for color sums
@@ -476,7 +476,7 @@ namespace mg5amcGpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated)
     // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering)
-    m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) );
+    m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) );
     m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) );
 #endif
 #ifndef MGONGPU_HAS_NO_BLAS
@@ -507,7 +507,7 @@ namespace mg5amcGpu
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc
index 203de90b29..d9dc04eb8b 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc
@@ -303,7 +303,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
                    fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   bool storeChannelWeights,
                    fptype* allNumerators,   // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
                    fptype* colAllJamp2s,    // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -312,7 +312,7 @@ namespace mg5amcCpu
 #else
                    cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                   const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+                   bool storeChannelWeights,
                    fptype* allNumerators,   // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
                    fptype_sv* jamp2_sv,     // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -408,7 +408,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -430,10 +431,6 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
     fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
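With the @@ -408 hunk above, each GPU thread of `calculate_jamps` points `numerators` at a private `ndiagrams`-sized record instead of a shared per-event scalar. The thread-local addressing, isolated as a CUDA sketch (`double` stands in for `fptype`):

__device__ double* myNumerators( double* allNumerators, int ndiagrams )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread, as in the kernel
  return &allNumerators[ievt * ndiagrams];                // ndiagrams slots per event
}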
@@ -2998,9 +2995,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -3076,8 +3072,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -3139,25 +3134,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom) /* clang-format on */
 {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -3202,16 +3207,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -3279,6 +3312,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
             fptype* allMEs,    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,    // output: helicity selection[nevt]
@@ -3288,6 +3322,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,       // if true, multiply channel weight to ME output
 #endif
             fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -3299,8 +3335,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             int* allselcol,          // output: helicity selection[nevt]
-            fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,   // if true, multiply channel weight to ME output
 #endif
             const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -3343,7 +3381,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -3362,7 +3400,9 @@ namespace mg5amcCpu
         fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
         for( int i = 0; i < processConfig::ndiagrams; ++i )
+        {
           numerators_sv[i] = fptype_sv{ 0 };
+        }
         denominators_sv = fptype_sv{ 0 };
 #endif
       }
@@ -3374,6 +3414,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
 
   // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+
   // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
   // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
   // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -3382,9 +3423,10 @@ namespace mg5amcCpu
     const int ihel = cGoodHel[ighel];
     fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
     fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
     gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
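The color choice in the hunks that follow builds a cumulative sum over the colors allowed for the sampled channel's config, then inverse-CDF samples it with `allrndcol`. A scalar sketch of that selection for one event (`allowed` mirrors `mgOnGpu::icolamp[iconfig - 1]`):

#include <vector>

// Returns a 1-based color index sampled from the allowed-color cumulative sum.
int sampleColor( const std::vector<double>& jamp2,  // |jamp|^2 per color, one event
                 const std::vector<bool>& allowed,  // colors allowed for this config
                 double rnd )                       // uniform random number in [0,1)
{
  const int ncolor = (int)jamp2.size();
  std::vector<double> targetamp( ncolor, 0. );
  for( int icolC = 0; icolC < ncolor; icolC++ )
  {
    targetamp[icolC] = ( icolC == 0 ? 0. : targetamp[icolC - 1] );
    if( allowed[icolC] ) targetamp[icolC] += jamp2[icolC]; // cumulative sum over allowed colors
  }
  for( int icolC = 0; icolC < ncolor; icolC++ )
    if( rnd < targetamp[icolC] / targetamp[ncolor - 1] ) return icolC + 1; // 1-based, as in Fortran
  return ncolor;
}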
allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -3395,9 +3437,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3435,9 +3483,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -3452,7 +3497,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -3500,82 +3546,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
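// NB (hedged sketch of the sampling in this loop): the channel is drawn by inverse-CDF
// sampling over the per-diagram weights accumulated in calculate_jamps, conceptually:
//   r = allrnddiagram[ievt]; cdf = 0.;
//   for each channel c with a valid SDE iconfig: { cdf += w[c] / normalization;
//     if( r < cdf ) { channelId = c + 1; break; } } // 1-based channelId, as stored below
// where w[c] is this event's numerator for diagram c ('r', 'cdf', 'w' are illustrative names);
// channelIdVec[ieppV] is pre-set to mgOnGpu::nchannels as a fallback if no bin is hit.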
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -3600,13 +3661,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -3614,7 +3669,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index 7b57d7c763..e3c578f5e0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for
ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the channel weight into the ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the channel weight into the ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 47e89cf5f5..da1647639c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -58,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0018525123596191406  +DEBUG: model prefixing takes 0.0017483234405517578  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,17 +151,17 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 0.699 s +1 processes with 1240 diagrams generated in 0.713 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  @@ -182,22 +182,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxggg DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1564]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 
142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 
581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 
1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1588]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 
307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 
622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1589]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 2.548 s -Wrote files for 2281 helas calls in 25.269 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 2.518 s 
+Wrote files for 2281 helas calls in 34.890 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.176 s +ALOHA: aloha creates 5 routines in 0.190 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.172 s +ALOHA: aloha creates 10 routines in 0.181 s VVV1 VVV1 FFV1 @@ -225,17 +225,17 @@ INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  +DEBUG: result.returncode =  0 [output.py at line 275]  Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m36.059s -user 0m13.905s -sys 0m1.219s -Code generation completed in 36 seconds +real 0m46.871s +user 0m13.972s +sys 0m1.182s +Code generation completed in 46 seconds /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , 
m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
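// NB (hedged): in the sigmaKin calls patched in this file, the two new 'nullptr' arguments
// (allrnddiagram, allDiagramIdsOut) and the new 'true' (mulChannelWeight) preserve the legacy
// behaviour: no per-event diagram sampling, and the single-diagram channel weight is still
// multiplied into the ME. A caller opting into diagram sampling would presumably pass real
// buffers instead, e.g. rnddiag.data() and seldiag.data() (illustrative names, not in this patch).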
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index f9ae4dcde8..e9c80d8364 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv 
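// NB (hedged sketch): 'storeChannelWeights' replaces the old per-event 'channelId != 0' guard;
// per-diagram weights are now accumulated for ALL diagrams whenever either SDE channelIds or
// diagram sampling is requested, roughly:
//   const bool storeChannelWeights = ( allChannelIds != nullptr || allrnddiagram != nullptr );
//   if( storeChannelWeights ) { numerators_sv[idiag] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); }
// ('idiag' is an illustrative index; the generated hunks below use literal indices 0..ndiagrams-1)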
= DEN_ACCESS::kernelAccess( denominators ); @@ -464,7 +461,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -495,7 +492,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -585,7 +582,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -616,7 +613,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -705,7 +702,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -736,7 +733,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1063,7 +1060,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[13] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1094,7 +1091,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[14] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1125,7 +1122,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[15] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1214,7 +1211,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[17] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1245,7 +1242,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[18] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1276,7 +1273,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram 
number 20 VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[19] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1365,7 +1362,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[21] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1396,7 +1393,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[22] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1427,7 +1424,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[23] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1520,7 +1517,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[25] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1537,7 +1534,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[26] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1554,7 +1551,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[27] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1577,7 +1574,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[28] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1596,7 +1593,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[29] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1619,7 +1616,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[30] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1673,7 +1670,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[32] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1690,7 +1687,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 34 FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[33] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1707,7 +1704,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[34] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1726,7 +1723,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[35] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1743,7 +1740,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 37 FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[36] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1760,7 +1757,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 38 FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[37] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1779,7 +1776,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 39 FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[38] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1798,7 +1795,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 40 FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[39] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1817,7 +1814,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 41 FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[40] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1841,7 +1838,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 42 FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[41] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1858,7 +1855,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 43 FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[42] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1875,7 +1872,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 44 VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[43] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1898,7 +1895,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 45 FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { 
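// NB (on the pattern repeated throughout these generated hunks): each diagram d adds its
// |amp|^2 to numerators_sv[d] and to the shared denominators_sv, so that the ratio
// numerators_sv[d] / denominators_sv is the single-diagram fraction later used for
// channel sampling and, if mulChannelWeight is set, for the ME channel weight.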
numerators_sv[44] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1917,7 +1914,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 46 VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[45] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1940,7 +1937,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 47 FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[46] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1994,7 +1991,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 49 FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[48] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2011,7 +2008,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 50 FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[49] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2028,7 +2025,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 51 FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[50] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2047,7 +2044,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 52 FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[51] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2064,7 +2061,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 53 FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[52] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2081,7 +2078,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 54 FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[53] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2100,7 +2097,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 55 FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[54] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2119,7 +2116,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 56 FFV1_0( w_fp[34], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[55] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2138,7 +2135,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 57 FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[56] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ 
-2162,7 +2159,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 58 FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[57] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2179,7 +2176,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 59 FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[58] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2196,7 +2193,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 60 VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[59] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2219,7 +2216,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 61 FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[60] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2238,7 +2235,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 62 VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[61] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2261,7 +2258,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 63 FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[62] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2314,7 +2311,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 65 FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[64] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2331,7 +2328,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 66 FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[65] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2348,7 +2345,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 67 FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[66] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2367,7 +2364,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 68 FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[67] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2384,7 +2381,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 69 FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[68] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2401,7 +2398,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 70 FFV1_0( 
w_fp[38], w_fp[47], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[69] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2420,7 +2417,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 71 FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[70] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2439,7 +2436,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 72 FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[71] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2458,7 +2455,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 73 FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[72] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2482,7 +2479,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 74 FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[73] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2499,7 +2496,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 75 FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[74] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2516,7 +2513,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 76 VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[75] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2539,7 +2536,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 77 FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[76] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2558,7 +2555,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 78 VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[77] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2581,7 +2578,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 79 FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[78] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2634,7 +2631,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 81 FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[80] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2653,7 +2650,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 82 FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[81] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2672,7 +2669,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 83 FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[82] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2695,7 +2692,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 84 FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[83] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2712,7 +2709,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 85 FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[84] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2729,7 +2726,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 86 VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[85] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2752,7 +2749,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 87 FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[86] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2771,7 +2768,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 88 VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[87] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2794,7 +2791,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 89 FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[88] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2847,7 +2844,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 91 FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[90] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2866,7 +2863,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 92 FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[91] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2885,7 +2882,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 93 FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[92] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2908,7 +2905,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 94 FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { 
numerators_sv[93] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2925,7 +2922,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 95 FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[94] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2942,7 +2939,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 96 VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[95] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2965,7 +2962,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 97 FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[96] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2984,7 +2981,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 98 VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[97] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3007,7 +3004,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 99 FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[98] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3060,7 +3057,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 101 FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[100] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3079,7 +3076,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 102 FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[101] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3098,7 +3095,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 103 FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[102] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3121,7 +3118,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 104 FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[103] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3140,7 +3137,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 105 FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[104] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3163,7 +3160,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 106 FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[105] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] 
); @@ -3182,7 +3179,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 107 FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[106] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3205,7 +3202,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 108 FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[107] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3228,7 +3225,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 109 FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[108] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3251,7 +3248,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 110 FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[109] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3270,7 +3267,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 111 FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[110] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3293,7 +3290,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 112 FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[111] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3312,7 +3309,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 113 FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[112] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3335,7 +3332,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 114 FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[113] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3358,7 +3355,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 115 FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[114] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3381,7 +3378,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 116 FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[115] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3400,7 +3397,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 117 FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[116] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3423,7 +3420,7 @@ namespace mg5amcCpu // Amplitude(s) for 
diagram number 118 FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[117] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3442,7 +3439,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 119 FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[118] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3465,7 +3462,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 120 FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[119] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3488,7 +3485,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 121 FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[120] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3582,7 +3579,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 124 FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[123] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3598,7 +3595,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 125 FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[124] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3615,7 +3612,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 126 FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[125] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3631,7 +3628,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 127 FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[126] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3647,7 +3644,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 128 FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[127] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3663,7 +3660,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 129 FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[128] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3679,7 +3676,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 130 VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[129] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3698,7 +3695,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 131 FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, 
&_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[130] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3715,7 +3712,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 132 FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[131] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3732,7 +3729,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 133 VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[132] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3751,7 +3748,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 134 FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[133] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3768,7 +3765,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 135 FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[134] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3785,7 +3782,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 136 VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[135] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3804,7 +3801,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 137 FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[136] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3821,7 +3818,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 138 FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[137] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3862,7 +3859,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 140 VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[139] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3885,7 +3882,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 141 VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[140] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3942,7 +3939,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 143 FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[142] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3959,7 +3956,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 144 FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId 
!= 0 ) + if( storeChannelWeights ) { numerators_sv[143] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3978,7 +3975,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 145 FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[144] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3995,7 +3992,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 146 FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[145] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4014,7 +4011,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 147 FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[146] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4031,7 +4028,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 148 VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[147] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4050,7 +4047,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 149 FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[148] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4067,7 +4064,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 150 FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[149] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4084,7 +4081,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 151 VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[150] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4103,7 +4100,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 152 FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[151] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4120,7 +4117,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 153 FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[152] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4139,7 +4136,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 154 VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[153] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4162,7 +4159,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 155 FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[154] += 
cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4182,7 +4179,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 156 VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[155] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4205,7 +4202,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 157 VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[156] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4262,7 +4259,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 159 FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[158] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4279,7 +4276,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 160 FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[159] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4298,7 +4295,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 161 FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[160] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4315,7 +4312,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 162 FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[161] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4334,7 +4331,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 163 FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[162] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4351,7 +4348,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 164 VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[163] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4370,7 +4367,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 165 FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[164] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4387,7 +4384,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 166 FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[165] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4404,7 +4401,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 167 VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[166] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ 
-4423,7 +4420,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 168 FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[167] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4440,7 +4437,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 169 FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[168] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4459,7 +4456,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 170 VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[169] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4482,7 +4479,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 171 FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[170] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4502,7 +4499,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 172 VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[171] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4525,7 +4522,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 173 VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[172] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4582,7 +4579,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 175 FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[174] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4599,7 +4596,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 176 FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[175] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4618,7 +4615,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 177 FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[176] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4635,7 +4632,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 178 FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[177] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4654,7 +4651,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 179 FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[178] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4671,7 +4668,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram 
number 180 VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[179] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4690,7 +4687,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 181 FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[180] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4707,7 +4704,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 182 FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[181] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4724,7 +4721,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 183 VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[182] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4743,7 +4740,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 184 FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[183] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4760,7 +4757,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 185 FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[184] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4779,7 +4776,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 186 VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[185] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4802,7 +4799,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 187 FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[186] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4821,7 +4818,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 188 FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[187] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4837,7 +4834,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 189 FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[188] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4853,7 +4850,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 190 FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[189] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4869,7 +4866,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 191 FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], 1.0, 
&_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[190] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4885,7 +4882,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 192 FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[191] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4901,7 +4898,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 193 FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[192] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4917,7 +4914,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 194 FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[193] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4934,7 +4931,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 195 VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[194] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4953,7 +4950,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 196 FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[195] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4970,7 +4967,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 197 FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[196] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -4986,7 +4983,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 198 FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[197] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5002,7 +4999,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 199 FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[198] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5018,7 +5015,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 200 FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[199] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5034,7 +5031,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 201 FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[200] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5050,7 +5047,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 202 FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 
) + if( storeChannelWeights ) { numerators_sv[201] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5066,7 +5063,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 203 FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[202] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5083,7 +5080,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 204 VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[203] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5102,7 +5099,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 205 FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[204] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5119,7 +5116,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 206 FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[205] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5135,7 +5132,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 207 FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[206] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5151,7 +5148,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 208 FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[207] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5167,7 +5164,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 209 FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[208] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5183,7 +5180,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 210 FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[209] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5199,7 +5196,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 211 FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[210] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5215,7 +5212,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 212 FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[211] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5232,7 +5229,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 213 VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[212] += cxabs2( 
amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5251,7 +5248,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 214 FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[213] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5268,7 +5265,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 215 FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[214] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5285,7 +5282,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 216 FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[215] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5304,7 +5301,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 217 VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[216] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5327,7 +5324,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 218 VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[217] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5384,7 +5381,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 220 FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[219] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5403,7 +5400,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 221 FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[220] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5420,7 +5417,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 222 FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[221] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5437,7 +5434,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 223 FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[222] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5456,7 +5453,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 224 VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[223] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5479,7 +5476,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 225 VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[224] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5536,7 
+5533,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 227 FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[226] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5555,7 +5552,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 228 FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[227] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5572,7 +5569,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 229 FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[228] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5589,7 +5586,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 230 FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[229] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5608,7 +5605,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 231 VVV1_0( w_fp[62], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[230] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5631,7 +5628,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 232 VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[231] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5688,7 +5685,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 234 FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[233] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -5707,7 +5704,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 235 FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[234] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6023,7 +6020,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 247 FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[246] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6039,7 +6036,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 248 FFV1_0( w_fp[34], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[247] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6056,7 +6053,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 249 FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[248] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6072,7 +6069,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 250 
FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[249] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6088,7 +6085,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 251 FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[250] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6104,7 +6101,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 252 FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[251] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6120,7 +6117,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 253 VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[252] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6139,7 +6136,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 254 FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[253] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6156,7 +6153,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 255 FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[254] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6173,7 +6170,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 256 VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[255] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6192,7 +6189,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 257 FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[256] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6209,7 +6206,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 258 FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[257] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6226,7 +6223,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 259 VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[258] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6245,7 +6242,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 260 FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[259] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6262,7 +6259,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 261 FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); 
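// --------------------------------------------------------------------------
// Editor's note on the repeated hunk pattern in this file: every change in
// this run is the same one-liner, replacing the guard "if( channelId != 0 )"
// with "if( storeChannelWeights )" around the per-diagram weight
// accumulation, so storing the weights is decoupled from whether a specific
// channel has been selected for the current event. A minimal free-standing
// sketch of the accumulated quantities follows; the scalar fptype/cxtype
// stand-ins, the helper name and the explicit loop are illustrative
// assumptions (the generated kernel instead unrolls one FFV1_0/VVV1_0 call
// plus one guarded block per diagram, on its SIMD/CUDA vector types).

#include <complex>
#include <vector>

using fptype = double;
using cxtype = std::complex<fptype>;

// |z|^2, i.e. what the cxabs2 helper used in the hunks above computes
inline fptype cxabs2( const cxtype& z ) { return z.real() * z.real() + z.imag() * z.imag(); }

void accumulateChannelWeights( const std::vector<cxtype>& ampPerDiagram, // one amplitude per diagram
                               const bool storeChannelWeights,           // new guard (was: channelId != 0)
                               std::vector<fptype>& numerators,          // one slot per diagram
                               fptype& denominator )                     // shared across all diagrams
{
  for( std::size_t idiag = 0; idiag < ampPerDiagram.size(); ++idiag )
  {
    if( storeChannelWeights ) // the one-line change repeated in every hunk
    {
      numerators[idiag] += cxabs2( ampPerDiagram[idiag] ); // this diagram's |amp|^2
      denominator += cxabs2( ampPerDiagram[idiag] );       // incoherent sum of |amp|^2 over diagrams
    }
  }
}
// --------------------------------------------------------------------------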
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[260] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6301,7 +6298,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 263 VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[262] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6324,7 +6321,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 264 VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[263] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6381,7 +6378,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 266 FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[265] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6398,7 +6395,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 267 FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[266] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6417,7 +6414,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 268 FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[267] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6434,7 +6431,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 269 FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[268] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6453,7 +6450,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 270 FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[269] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6470,7 +6467,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 271 VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[270] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6489,7 +6486,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 272 FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[271] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6506,7 +6503,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 273 FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[272] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6523,7 +6520,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 274 VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( 
storeChannelWeights ) { numerators_sv[273] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6542,7 +6539,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 275 FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[274] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6559,7 +6556,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 276 FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[275] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6578,7 +6575,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 277 VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[276] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6601,7 +6598,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 278 FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[277] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6620,7 +6617,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 279 VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[278] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6643,7 +6640,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 280 VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[279] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6700,7 +6697,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 282 FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[281] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6717,7 +6714,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 283 FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[282] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6736,7 +6733,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 284 FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[283] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6753,7 +6750,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 285 FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[284] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6772,7 +6769,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 286 FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[285] += cxabs2( amp_sv[0] 
); denominators_sv += cxabs2( amp_sv[0] ); @@ -6789,7 +6786,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 287 VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[286] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6808,7 +6805,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 288 FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[287] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6825,7 +6822,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 289 FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[288] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6842,7 +6839,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 290 VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[289] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6861,7 +6858,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 291 FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[290] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6878,7 +6875,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 292 FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[291] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6897,7 +6894,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 293 VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[292] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6920,7 +6917,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 294 FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[293] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6939,7 +6936,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 295 VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[294] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -6962,7 +6959,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 296 VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[295] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7019,7 +7016,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 298 FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[297] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7036,7 +7033,7 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 299 FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[298] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7055,7 +7052,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 300 FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[299] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7072,7 +7069,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 301 FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[300] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7091,7 +7088,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 302 FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[301] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7108,7 +7105,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 303 VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[302] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7127,7 +7124,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 304 FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[303] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7144,7 +7141,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 305 FFV1_0( w_fp[99], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[304] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7161,7 +7158,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 306 VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[305] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7180,7 +7177,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 307 FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[306] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7197,7 +7194,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 308 FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[307] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7216,7 +7213,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 309 VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[308] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7239,7 +7236,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 310 
FFV1_0( w_fp[90], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[309] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7258,7 +7255,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 311 FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[310] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7274,7 +7271,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 312 FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[311] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7290,7 +7287,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 313 FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[312] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7306,7 +7303,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 314 FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[313] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7322,7 +7319,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 315 FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[314] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7338,7 +7335,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 316 FFV1_0( w_fp[88], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[315] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7354,7 +7351,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 317 FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[316] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7371,7 +7368,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 318 VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[317] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7390,7 +7387,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 319 FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[318] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7407,7 +7404,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 320 FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[319] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7423,7 +7420,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 321 FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] ); 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[320] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7439,7 +7436,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 322 FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[321] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7455,7 +7452,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 323 FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[322] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7471,7 +7468,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 324 FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[323] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7487,7 +7484,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 325 FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[324] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7503,7 +7500,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 326 FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[325] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7520,7 +7517,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 327 VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[326] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7539,7 +7536,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 328 FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[327] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7556,7 +7553,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 329 FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[328] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7572,7 +7569,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 330 FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[329] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7588,7 +7585,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 331 FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[330] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7604,7 +7601,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 332 FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( 
storeChannelWeights ) { numerators_sv[331] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7620,7 +7617,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 333 FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[332] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7636,7 +7633,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 334 FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[333] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7652,7 +7649,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 335 FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[334] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7669,7 +7666,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 336 VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[335] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7688,7 +7685,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 337 FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[336] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7705,7 +7702,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 338 FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[337] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7722,7 +7719,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 339 FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[338] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7741,7 +7738,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 340 VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[339] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7764,7 +7761,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 341 VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[340] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7821,7 +7818,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 343 FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[342] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7840,7 +7837,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 344 FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[343] += cxabs2( amp_sv[0] ); 
denominators_sv += cxabs2( amp_sv[0] ); @@ -7857,7 +7854,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 345 FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[344] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7874,7 +7871,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 346 FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[345] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7893,7 +7890,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 347 VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[346] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7916,7 +7913,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 348 VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[347] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7973,7 +7970,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 350 FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[349] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -7992,7 +7989,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 351 FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[350] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8009,7 +8006,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 352 FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[351] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8026,7 +8023,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 353 FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[352] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8045,7 +8042,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 354 VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[353] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8068,7 +8065,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 355 VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[354] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8125,7 +8122,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 357 FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[356] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8144,7 +8141,7 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 358 FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[357] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8452,7 +8449,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 370 FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[369] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8469,7 +8466,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 371 FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[370] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8487,7 +8484,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 372 VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[371] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8510,7 +8507,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 373 FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[372] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8529,7 +8526,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 374 VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[373] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8552,7 +8549,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 375 FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[374] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8607,7 +8604,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 377 FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[376] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8624,7 +8621,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 378 FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[377] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8641,7 +8638,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 379 FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[378] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8660,7 +8657,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 380 FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[379] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8677,7 +8674,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 381 FFV1_0( 
w_fp[101], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[380] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8694,7 +8691,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 382 FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[381] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8713,7 +8710,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 383 FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[382] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8732,7 +8729,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 384 FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[383] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8751,7 +8748,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 385 FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[384] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8774,7 +8771,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 386 FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[385] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8791,7 +8788,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 387 FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[386] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8808,7 +8805,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 388 VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[387] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8831,7 +8828,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 389 FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[388] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8850,7 +8847,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 390 VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[389] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8873,7 +8870,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 391 FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[390] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8926,7 +8923,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 393 FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[392] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8943,7 +8940,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 394 FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[393] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8960,7 +8957,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 395 FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[394] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8979,7 +8976,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 396 FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[395] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -8996,7 +8993,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 397 FFV1_0( w_fp[52], w_fp[106], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[396] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9013,7 +9010,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 398 FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[397] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9032,7 +9029,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 399 FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[398] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9051,7 +9048,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 400 FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[399] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9070,7 +9067,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 401 FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[400] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9093,7 +9090,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 402 FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[401] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9112,7 +9109,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 403 FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[402] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9135,7 +9132,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 404 FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 
) + if( storeChannelWeights ) { numerators_sv[403] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9154,7 +9151,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 405 FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[404] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9177,7 +9174,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 406 FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[405] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9200,7 +9197,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 407 FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[406] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9281,7 +9278,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 409 VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[408] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9312,7 +9309,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 410 VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[409] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9343,7 +9340,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 411 VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[410] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9374,7 +9371,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 412 FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[411] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9397,7 +9394,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 413 FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[412] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9416,7 +9413,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 414 FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[413] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9435,7 +9432,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 415 FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[414] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9458,7 +9455,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 416 FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[415] += 
cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9477,7 +9474,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 417 FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[416] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9496,7 +9493,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 418 FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[417] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9515,7 +9512,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 419 FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[418] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9538,7 +9535,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 420 FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[419] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9557,7 +9554,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 421 FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[420] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9580,7 +9577,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 422 FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[421] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9603,7 +9600,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 423 FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[422] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9684,7 +9681,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 425 VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[424] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9715,7 +9712,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 426 VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[425] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9746,7 +9743,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 427 VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[426] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9777,7 +9774,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 428 FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[427] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); 
@@ -9800,7 +9797,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 429 FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[428] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9819,7 +9816,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 430 FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[429] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9838,7 +9835,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 431 FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[430] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9861,7 +9858,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 432 FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[431] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9880,7 +9877,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 433 FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[432] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9899,7 +9896,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 434 VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[433] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -9930,7 +9927,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 435 VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[434] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10019,7 +10016,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 437 VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[436] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10050,7 +10047,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 438 VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[437] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10139,7 +10136,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 440 VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[439] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10170,7 +10167,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 441 VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[440] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10495,7 +10492,7 @@ namespace mg5amcCpu // 
Amplitude(s) for diagram number 447 VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[446] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10526,7 +10523,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 448 VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[447] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10557,7 +10554,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 449 VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[448] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10588,7 +10585,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 450 VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[449] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10611,7 +10608,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 451 FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[450] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10630,7 +10627,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 452 FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[451] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10647,7 +10644,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 453 FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[452] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10664,7 +10661,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 454 FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[453] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10683,7 +10680,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 455 VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[454] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10740,7 +10737,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 457 FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[456] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10759,7 +10756,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 458 FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[457] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10776,7 +10773,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 459 
FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[458] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10793,7 +10790,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 460 VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[459] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10816,7 +10813,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 461 FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[460] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10835,7 +10832,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 462 FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[461] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10852,7 +10849,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 463 FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[462] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10869,7 +10866,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 464 FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[463] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10888,7 +10885,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 465 VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[464] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10945,7 +10942,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 467 FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[466] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10964,7 +10961,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 468 FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[467] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10981,7 +10978,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 469 FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[468] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -10998,7 +10995,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 470 VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[469] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -11021,7 +11018,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 471 FFV1_0( w_fp[48], w_fp[2], w_fp[104], 
COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[470] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -11040,7 +11037,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 472
       FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[471] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -11057,7 +11054,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 473
       FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[472] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );

[... hunks for diagram numbers 474 through 742 omitted: each applies the same one-line replacement of "if( channelId != 0 )" by "if( storeChannelWeights )" guarding the corresponding "numerators_sv[n] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] );" block after every multichannel-enabled amplitude call ...]

@@ -17834,7 +17831,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 743
       FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[742] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -17850,7 +17847,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 744
       FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[743] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -17866,7 +17863,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 745
       FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], 1.0, &_fp[0] );
 #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[744] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -17883,7 +17880,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 746 FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[745] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -17900,7 +17897,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 747 FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[746] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -17919,7 +17916,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 748 FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[747] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -17935,7 +17932,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 749 FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[748] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -17951,7 +17948,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 750 FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[749] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -17967,7 +17964,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 751 FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[750] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -17983,7 +17980,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 752 FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[751] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -17999,7 +17996,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 753 FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[752] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18015,7 +18012,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 754 FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[753] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18032,7 +18029,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 755 FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[754] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18049,7 +18046,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 756 FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( 
channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[755] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18068,7 +18065,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 757 FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[756] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18084,7 +18081,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 758 FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[757] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18100,7 +18097,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 759 FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[758] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18116,7 +18113,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 760 FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[759] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18132,7 +18129,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 761 FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[760] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18148,7 +18145,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 762 FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[761] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18164,7 +18161,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 763 FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[762] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18181,7 +18178,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 764 FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[763] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18198,7 +18195,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 765 FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[764] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18217,7 +18214,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 766 FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[765] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18234,7 +18231,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 767 FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) 
{ numerators_sv[766] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18253,7 +18250,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 768 VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[767] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18276,7 +18273,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 769 FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[768] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18295,7 +18292,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 770 VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[769] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18318,7 +18315,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 771 FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[770] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18371,7 +18368,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 773 FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[772] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18388,7 +18385,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 774 FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[773] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18407,7 +18404,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 775 VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[774] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18430,7 +18427,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 776 FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[775] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18449,7 +18446,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 777 VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[776] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18472,7 +18469,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 778 FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[777] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18525,7 +18522,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 780 FFV1_0( w_fp[12], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[779] += cxabs2( amp_sv[0] ); 
denominators_sv += cxabs2( amp_sv[0] ); @@ -18542,7 +18539,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 781 FFV1_0( w_fp[3], w_fp[92], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[780] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18561,7 +18558,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 782 VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[781] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18584,7 +18581,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 783 FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[782] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18603,7 +18600,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 784 VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[783] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18626,7 +18623,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 785 FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[784] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18737,7 +18734,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 789 FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[788] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18753,7 +18750,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 790 FFV1_0( w_fp[90], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[789] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18769,7 +18766,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 791 FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[790] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18785,7 +18782,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 792 FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[791] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18801,7 +18798,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 793 FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[792] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18817,7 +18814,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 794 FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[793] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ 
-18833,7 +18830,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 795 FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[794] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18850,7 +18847,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 796 FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[795] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18867,7 +18864,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 797 FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[796] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18886,7 +18883,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 798 FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[797] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18902,7 +18899,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 799 FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[798] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18918,7 +18915,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 800 FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[799] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18934,7 +18931,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 801 FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[800] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18950,7 +18947,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 802 FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[801] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18966,7 +18963,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 803 FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[802] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18982,7 +18979,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 804 FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[803] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -18999,7 +18996,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 805 FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[804] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19016,7 +19013,7 @@ namespace 
mg5amcCpu // Amplitude(s) for diagram number 806 FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[805] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19035,7 +19032,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 807 FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[806] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19051,7 +19048,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 808 FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[807] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19067,7 +19064,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 809 FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[808] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19083,7 +19080,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 810 FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[809] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19099,7 +19096,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 811 FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[810] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19115,7 +19112,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 812 FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[811] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19131,7 +19128,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 813 FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[812] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19148,7 +19145,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 814 FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[813] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19165,7 +19162,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 815 FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[814] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19184,7 +19181,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 816 FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[815] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19201,7 +19198,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram 
number 817 FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[816] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19220,7 +19217,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 818 VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[817] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19243,7 +19240,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 819 FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[818] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19262,7 +19259,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 820 VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[819] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19285,7 +19282,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 821 FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[820] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19336,7 +19333,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 823 FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[822] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19353,7 +19350,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 824 FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[823] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19372,7 +19369,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 825 VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[824] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19395,7 +19392,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 826 FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[825] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19414,7 +19411,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 827 VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[826] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19437,7 +19434,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 828 FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[827] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19488,7 +19485,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 830 FFV1_0( w_fp[90], w_fp[18], 
w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[829] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19505,7 +19502,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 831 FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[830] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19524,7 +19521,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 832 VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[831] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19547,7 +19544,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 833 FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[832] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19566,7 +19563,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 834 VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[833] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19589,7 +19586,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 835 FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[834] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19696,7 +19693,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 839 VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[838] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19727,7 +19724,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 840 VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[839] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19816,7 +19813,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 842 VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[841] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19847,7 +19844,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 843 VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[842] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19936,7 +19933,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 845 VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[844] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -19967,7 +19964,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 846 VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[845] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20296,7 +20293,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 852 VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[851] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20327,7 +20324,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 853 VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[852] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20358,7 +20355,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 854 VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[853] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20389,7 +20386,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 855 VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[854] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20412,7 +20409,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 856 FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[855] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20431,7 +20428,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 857 FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[856] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20448,7 +20445,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 858 FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[857] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20467,7 +20464,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 859 FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[858] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20484,7 +20481,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 860 VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[859] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20541,7 +20538,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 862 FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[861] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20560,7 +20557,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 863 FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( 
channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[862] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20577,7 +20574,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 864 FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[863] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20594,7 +20591,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 865 VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[864] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20617,7 +20614,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 866 FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[865] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20636,7 +20633,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 867 FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[866] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20653,7 +20650,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 868 FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[867] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20672,7 +20669,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 869 FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[868] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20689,7 +20686,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 870 VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[869] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20746,7 +20743,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 872 FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[871] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20765,7 +20762,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 873 FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[872] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20782,7 +20779,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 874 FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[873] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20799,7 +20796,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 875 VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( 
storeChannelWeights ) { numerators_sv[874] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20822,7 +20819,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 876 FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[875] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20841,7 +20838,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 877 FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[876] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20858,7 +20855,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 878 FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[877] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20877,7 +20874,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 879 FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[878] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20894,7 +20891,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 880 VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[879] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20951,7 +20948,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 882 VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[881] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20974,7 +20971,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 883 FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[882] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -20993,7 +20990,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 884 FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[883] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21010,7 +21007,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 885 FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[884] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21029,7 +21026,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 886 FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[885] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21046,7 +21043,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 887 VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[886] 
+= cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21103,7 +21100,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 889 FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[888] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21126,7 +21123,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 890 FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[889] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21149,7 +21146,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 891 FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[890] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21172,7 +21169,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 892 FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[891] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21195,7 +21192,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 893 FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[892] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21214,7 +21211,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 894 FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[893] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21233,7 +21230,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 895 VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[894] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21264,7 +21261,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 896 VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[895] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21353,7 +21350,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 898 VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[897] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21384,7 +21381,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 899 VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[898] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21473,7 +21470,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 901 VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[900] += cxabs2( amp_sv[0] ); denominators_sv += 
cxabs2( amp_sv[0] ); @@ -21504,7 +21501,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 902 VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[901] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21831,7 +21828,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 908 VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[907] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21862,7 +21859,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 909 VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[908] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21893,7 +21890,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 910 VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[909] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21924,7 +21921,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 911 VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[910] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21947,7 +21944,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 912 FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[911] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21966,7 +21963,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 913 FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[912] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -21983,7 +21980,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 914 FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[913] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22002,7 +21999,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 915 FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[914] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22019,7 +22016,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 916 VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[915] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22076,7 +22073,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 918 FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[917] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22095,7 
+22092,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 919 FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[918] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22112,7 +22109,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 920 FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[919] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22129,7 +22126,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 921 VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[920] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22152,7 +22149,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 922 FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[921] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22171,7 +22168,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 923 FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[922] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22188,7 +22185,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 924 FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[923] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22207,7 +22204,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 925 FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[924] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22224,7 +22221,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 926 VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[925] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22281,7 +22278,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 928 FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[927] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22300,7 +22297,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 929 FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[928] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22317,7 +22314,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 930 FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[929] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22334,7 +22331,7 @@ namespace mg5amcCpu // 
Amplitude(s) for diagram number 931 VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[930] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22357,7 +22354,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 932 FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[931] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22376,7 +22373,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 933 FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[932] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22393,7 +22390,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 934 FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[933] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22412,7 +22409,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 935 FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[934] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22429,7 +22426,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 936 VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[935] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22486,7 +22483,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 938 VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[937] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22509,7 +22506,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 939 FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[938] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22528,7 +22525,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 940 FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[939] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22545,7 +22542,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 941 FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[940] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22564,7 +22561,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 942 FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[941] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22581,7 +22578,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 943 VVV1_0( 
w_fp[0], w_fp[69], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[942] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22638,7 +22635,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 945 FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[944] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22661,7 +22658,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 946 FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[945] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22684,7 +22681,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 947 FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[946] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22707,7 +22704,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 948 FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[947] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22730,7 +22727,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 949 FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[948] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22749,7 +22746,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 950 FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[949] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22768,7 +22765,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 951 VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[950] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22799,7 +22796,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 952 VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[951] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22888,7 +22885,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 954 VVV1_0( w_fp[56], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[953] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -22919,7 +22916,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 955 VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[954] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23008,7 +23005,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 957 VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], 1.0, 
&_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[956] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23039,7 +23036,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 958 VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[957] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23364,7 +23361,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 964 VVV1_0( w_fp[8], w_fp[24], w_fp[71], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[963] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23395,7 +23392,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 965 VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[964] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23426,7 +23423,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 966 VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[965] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23457,7 +23454,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 967 VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[966] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23480,7 +23477,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 968 FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[967] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23499,7 +23496,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 969 FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[968] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23516,7 +23513,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 970 FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[969] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23535,7 +23532,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 971 FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[970] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23552,7 +23549,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 972 VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[971] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23609,7 +23606,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 974 FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[973] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23628,7 +23625,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 975 FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[974] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23645,7 +23642,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 976 FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[975] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23662,7 +23659,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 977 VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[976] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23685,7 +23682,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 978 FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[977] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23704,7 +23701,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 979 FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[978] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23721,7 +23718,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 980 FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[979] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23740,7 +23737,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 981 FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[980] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23757,7 +23754,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 982 VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[981] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23814,7 +23811,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 984 FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[983] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23833,7 +23830,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 985 FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[984] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23850,7 +23847,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 986 FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( 
channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[985] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23867,7 +23864,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 987 VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[986] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23890,7 +23887,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 988 FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[987] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23909,7 +23906,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 989 FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[988] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23926,7 +23923,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 990 FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[989] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23945,7 +23942,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 991 FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[990] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -23962,7 +23959,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 992 VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[991] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24019,7 +24016,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 994 VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[993] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24042,7 +24039,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 995 FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[994] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24061,7 +24058,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 996 FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[995] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24078,7 +24075,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 997 FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[996] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24097,7 +24094,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 998 FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) 
{ numerators_sv[997] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24114,7 +24111,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 999 VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[998] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24171,7 +24168,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1001 FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1000] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24194,7 +24191,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1002 FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1001] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24217,7 +24214,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1003 FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1002] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24240,7 +24237,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1004 FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1003] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24263,7 +24260,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1005 FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1004] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24282,7 +24279,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1006 FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1005] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24301,7 +24298,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1007 VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1006] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24332,7 +24329,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1008 VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1007] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24421,7 +24418,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1010 VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1009] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24452,7 +24449,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1011 VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1010] += 
cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24541,7 +24538,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1013 VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1012] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24572,7 +24569,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1014 VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1013] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24839,7 +24836,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1019 VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1018] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24870,7 +24867,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1020 VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1019] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24959,7 +24956,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1022 VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1021] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -24990,7 +24987,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1023 VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1022] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -25079,7 +25076,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1025 VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1024] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -25110,7 +25107,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1026 VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1025] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -25375,7 +25372,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1031 VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1030] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -25406,7 +25403,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1032 VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1031] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -25495,7 +25492,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1034 VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1033] += cxabs2( amp_sv[0] 
); denominators_sv += cxabs2( amp_sv[0] ); @@ -25526,7 +25523,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1035 VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1034] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -25615,7 +25612,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1037 VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1036] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -25646,7 +25643,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1038 VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1037] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26187,7 +26184,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1046 FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1045] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26203,7 +26200,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1047 FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1046] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26219,7 +26216,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1048 FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1047] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26235,7 +26232,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1049 FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1048] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26251,7 +26248,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1050 FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1049] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26267,7 +26264,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1051 FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1050] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26283,7 +26280,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1052 FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1051] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26299,7 +26296,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1053 FFV1_0( w_fp[40], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1052] += cxabs2( amp_sv[0] ); 
denominators_sv += cxabs2( amp_sv[0] ); @@ -26315,7 +26312,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1054 FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1053] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26331,7 +26328,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1055 FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1054] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26347,7 +26344,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1056 FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1055] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26363,7 +26360,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1057 FFV1_0( w_fp[60], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1056] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26379,7 +26376,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1058 FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1057] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26398,7 +26395,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1059 FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1058] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26415,7 +26412,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1060 FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1059] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26434,7 +26431,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1061 VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1060] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26457,7 +26454,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1062 FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1061] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26474,7 +26471,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1063 VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1062] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26531,7 +26528,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1065 FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1064] += cxabs2( amp_sv[0] ); denominators_sv 
+= cxabs2( amp_sv[0] ); @@ -26547,7 +26544,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1066 FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1065] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26563,7 +26560,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1067 FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1066] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26579,7 +26576,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1068 FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1067] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26595,7 +26592,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1069 FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1068] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26611,7 +26608,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1070 FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1069] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26627,7 +26624,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1071 FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1070] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26643,7 +26640,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1072 FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1071] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26659,7 +26656,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1073 FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1072] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26675,7 +26672,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1074 FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1073] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26691,7 +26688,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1075 FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1074] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26707,7 +26704,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1076 FFV1_0( w_fp[60], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1075] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] 
); @@ -26723,7 +26720,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1077 FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1076] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26742,7 +26739,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1078 FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1077] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26759,7 +26756,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1079 FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1078] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26778,7 +26775,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1080 VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1079] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26801,7 +26798,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1081 FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1080] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26818,7 +26815,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1082 VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1081] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26875,7 +26872,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1084 FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1083] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26891,7 +26888,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1085 FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1084] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26907,7 +26904,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1086 FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1085] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26923,7 +26920,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1087 FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1086] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26939,7 +26936,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1088 FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1087] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26955,7 
+26952,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1089 FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1088] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26971,7 +26968,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1090 FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1089] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -26987,7 +26984,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1091 FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1090] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27003,7 +27000,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1092 FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1091] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27019,7 +27016,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1093 FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1092] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27035,7 +27032,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1094 FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1093] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27051,7 +27048,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1095 FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1094] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27067,7 +27064,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1096 FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1095] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27086,7 +27083,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1097 FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1096] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27103,7 +27100,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1098 FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1097] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27122,7 +27119,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1099 VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1098] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27145,7 +27142,7 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 1100 FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1099] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27162,7 +27159,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1101 VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1100] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27219,7 +27216,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1103 FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1102] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27238,7 +27235,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1104 FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1103] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27255,7 +27252,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1105 FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1104] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27274,7 +27271,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1106 VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1105] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27297,7 +27294,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1107 FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1106] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27314,7 +27311,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1108 VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1107] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27371,7 +27368,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1110 FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1109] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27390,7 +27387,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1111 FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1110] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27407,7 +27404,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1112 FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1111] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27426,7 +27423,7 @@ namespace mg5amcCpu 
// Amplitude(s) for diagram number 1113 VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1112] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27449,7 +27446,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1114 FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1113] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27466,7 +27463,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1115 VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1114] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27523,7 +27520,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1117 FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1116] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27542,7 +27539,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1118 FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1117] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27559,7 +27556,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1119 FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1118] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27578,7 +27575,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1120 VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1119] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27601,7 +27598,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1121 FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1120] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -27618,7 +27615,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1122 VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1121] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -33347,9 +33344,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection 
during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -33425,8 +33421,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -33488,25 +33483,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
    {
      fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
      fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
      for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
      {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
+      }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
    }
 #endif
    return;
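Editorial sketch (not part of the patch): the per-diagram bookkeeping that normalise_output now consumes, written out for a single event in plain C++. The [ighel][ievt][idiag] buffer layout and the "helicity #0 slot doubles as the total" trick follow the hunk above; all names and the std::vector wrapping here are illustrative assumptions, not the plugin's API.

#include <cassert>
#include <vector>

using fptype = double;

// Sum the per-helicity, per-diagram numerators into the "helicity #0" slot,
// then reweight one event's |M|^2 by its single-diagram channel weight.
fptype applyChannelWeight( std::vector<fptype>& numerators,         // [nGoodHel][nevt][ndiagrams]
                           const std::vector<fptype>& denominators, // [nGoodHel][nevt]
                           fptype me, unsigned int channelId,
                           int ievt, int nevt, int nGoodHel, int ndiagrams )
{
  fptype* tot = &numerators[ievt * ndiagrams]; // helicity #0 slot accumulates the total
  fptype totDen = denominators[ievt];
  for( int ighel = 1; ighel < nGoodHel; ighel++ )
  {
    const fptype* h = &numerators[( ievt + ighel * nevt ) * ndiagrams];
    for( int idiag = 0; idiag < ndiagrams; ++idiag ) tot[idiag] += h[idiag];
    totDen += denominators[ighel * nevt + ievt];
  }
  assert( channelId >= 1 ); // channelIds are 1-based, as in the kernel
  return me * tot[channelId - 1] / totDen; // single-diagram enhancement weight
}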
@@ -33551,16 +33556,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
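Editorial sketch (not part of the patch): the kernel above samples a diagram by an inverse-CDF walk over the unnormalized per-diagram numerators, skipping diagrams without an SDE config. A minimal standalone version of the same technique follows; the names are illustrative, not the plugin's API.

#include <vector>

// Given per-diagram weights w[i] >= 0 and a uniform random number r in [0,1),
// return the 1-based index of the first diagram whose cumulative fraction
// exceeds r (the same draw the kernel performs per event).
unsigned int sampleDiagram( const std::vector<double>& w, double r )
{
  double norm = 0.;
  for( double wi : w ) norm += wi;
  double cumul = 0.;
  for( unsigned int i = 0; i < w.size(); ++i )
  {
    cumul += w[i];
    if( r < cumul / norm ) return i + 1; // 1-based channelId, as in the kernel
  }
  return (unsigned int)w.size(); // guard against rounding, as in the kernel's fallback
}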
@@ -33628,6 +33661,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -33637,6 +33671,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -33648,8 +33684,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol,          // output: helicity selection[nevt]
-    fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply the ME output by the channel weight
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -33692,7 +33730,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -33711,7 +33749,9 @@ namespace mg5amcCpu
       fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
       for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -33723,6 +33763,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++

     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -33731,9 +33772,10 @@ namespace mg5amcCpu
       const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
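Editorial sketch (not part of the patch): the ghelAllNumerators super-buffer offsets implied by the gpuMemset and stream-launch hunks above. The layout is inferred from this patch, not guaranteed by the plugin.

// size  = nGoodHel * nevt * ndiagrams                    (was nGoodHel * nevt)
// slice = buf + ighel * nevt * ndiagrams                 (one helicity)
// event = slice + ievt * ndiagrams                       (one event)
// value = event[idiag]                                   (one diagram)
inline double* numeratorSlot( double* buf, int ighel, int ievt, int nevt, int ndiagrams )
{
  return buf + ( ighel * nevt + ievt ) * ndiagrams; // contiguous ndiagrams values per event
}

This is the same address arithmetic used by both the stream loop (helicity slice) and normalise_output (event slot within a slice).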
@@ -33744,9 +33786,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -33784,9 +33832,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -33801,7 +33846,8 @@ namespace mg5amcCpu
        cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+       bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
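Editorial sketch (not part of the patch): the flag logic implied by the launch hunks above, factored into a helper for readability. The semantics are inferred from this patch; the helper name is illustrative only.

// Bookkeeping of per-diagram weights is needed either for fixed per-event
// channel ids (Fortran-driven SDE mode) or for an event-by-event random
// diagram draw (allrnddiagram != nullptr); otherwise plain |M|^2 suffices.
inline bool wantChannelWeights( const unsigned int* allChannelIds, const double* allrnddiagram )
{
  return allChannelIds != nullptr || allrnddiagram != nullptr;
}

Under this reading, mulChannelWeight is an independent switch: the weights can be computed and written out without being folded into the matrix element.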
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -33949,13 +34010,7 @@ namespace mg5amcCpu
   // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
   // [NB 'sum over final spins, average over initial spins', eg see
   // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
   for( int ipagV = 0; ipagV < npagV; ++ipagV )
   {
     const int ievt0 = ipagV * neppV;
@@ -33963,7 +34018,7 @@
     fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
     MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
       fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
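An aside to make the new sampling logic above easier to follow: the event-by-event diagram choice is an inverse-CDF draw over the per-diagram numerators, and channels without an SDE config (channel2iconfig[ichan] == -1) are excluded from both the normalization and the cumulative sum. A scalar sketch of the same logic, with hypothetical names and without the SIMD paging arithmetic:

// Sketch only (hypothetical scalar helper, not part of the patch): choose a
// channel with probability proportional to its single-diagram numerator,
// mirroring the allrnddiagram loops above.
unsigned int sampleDiagram( const fptype* numerators,   // per-diagram numerators for one event, size nchannels
                            const int* channel2iconfig, // -1 if a channel has no SDE iconfig
                            unsigned int nchannels,
                            fptype rnd )                // uniform random number in [0,1)
{
  fptype normalization = 0;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  fptype numerator_sum = 0;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue;
    numerator_sum += numerators[ichan];
    if( rnd < numerator_sum / normalization ) return ichan + 1; // 1-based channelId
  }
  return nchannels; // fallback (as in the patch) if rnd rounds up to 1.0
}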
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h
index 201a432a8a..ee2421cf9a 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h
@@ -163,6 +163,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -171,6 +172,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -194,6 +198,8 @@ namespace mg5amcCpu
     int* allselcol, // output: helicity selection[nevt]
     fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o 
$(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 2832528673..1240605a6d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -58,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0018684864044189453  +DEBUG: model prefixing takes 0.003571033477783203  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 0.701 s +1 processes with 1240 diagrams generated in 0.709 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 
'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 2.523 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 2.512 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.138 s +ALOHA: aloha creates 5 routines in 0.157 s VVV1 VVV1 FFV1 @@ -200,7 +200,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
quit -real 0m8.591s -user 0m5.091s -sys 0m0.309s -Code generation completed in 9 seconds +real 0m7.550s +user 0m5.121s +sys 0m0.217s +Code generation completed in 8 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-      sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+      sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
       assert( useChannelIds == false );
       sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
index cc4e37eaa9..12c45ededb 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
@@ -303,7 +303,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
     fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+    bool storeChannelWeights, // input: if true, accumulate the multichannel numerators and denominators
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -312,7 +312,7 @@
 #else
     cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+    bool storeChannelWeights, // input: if true, accumulate the multichannel numerators and denominators
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -408,7 +408,8 @@
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -430,10 +431,6 @@
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
     fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( 
numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -32402,9 +32399,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -32480,8 +32476,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -32543,25 +32538,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( 
int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -32606,16 +32611,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -32683,6 +32716,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -32692,6 +32726,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* 
ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -32703,8 +32739,10 @@
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol, // output: helicity selection[nevt]
-    fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -32747,7 +32785,7 @@
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -32766,7 +32804,9 @@
       fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
       for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
         numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -32778,6 +32818,7 @@
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
   // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+  // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
   // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
   // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -32786,9 +32827,10 @@
   {
     const int ihel = cGoodHel[ighel];
     fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
     fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernelStream( calculate_jamps, gpublocks, 
gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -32799,9 +32841,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -32839,9 +32887,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -32856,7 +32901,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -32904,82 +32950,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -33004,13 +33065,7 @@ namespace mg5amcCpu
   // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
   // [NB 'sum over final spins, average over initial spins', eg see
   // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
   for( int ipagV = 0; ipagV < npagV; ++ipagV )
   {
     const int ievt0 = ipagV * neppV;
@@ -33018,7 +33073,7 @@
     fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
     MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
       fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
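The color choice just above follows the same cumulative pattern as the diagram choice: build a running sum targetamp over the squared partial amplitudes jamp2 allowed for this iconfig (mgOnGpu::icolamp[iconfig - 1][icolC]), then select the first color whose cumulative fraction exceeds allrndcol[ievt]. A scalar sketch (hypothetical names, SIMD paging omitted):

// Sketch only (hypothetical scalar helper, not part of the patch): pick a
// color from the cumulative distribution of squared partial amplitudes,
// restricted to the colors flagged in icolamp for this iconfig, mirroring
// the targetamp loop above. Returns a 1-based index (NB Fortran [1,ncolor]).
int selectColor( const fptype* jamp2,    // squared partial amplitudes for one event, size ncolor
                 const bool* icolampRow, // mgOnGpu::icolamp[iconfig - 1], size ncolor
                 int ncolor,
                 fptype rnd )            // uniform random number in [0,1)
{
  fptype total = 0;
  for( int icolC = 0; icolC < ncolor; icolC++ )
    if( icolampRow[icolC] ) total += jamp2[icolC];
  fptype targetamp = 0; // running sum, i.e. the cumulative distribution
  for( int icolC = 0; icolC < ncolor; icolC++ )
  {
    if( icolampRow[icolC] ) targetamp += jamp2[icolC];
    if( rnd < targetamp / total ) return icolC + 1;
  }
  return ncolor; // fallback if rnd rounds up to 1.0
}
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h
index 201a432a8a..ee2421cf9a 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h
@@ -163,6 +163,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -171,6 +172,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities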
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -194,6 +198,8 @@ namespace mg5amcCpu
     int* allselcol, // output: helicity selection[nevt]
     fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
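The GpuAbstraction.h hunks in this patch (above for gg_ttggg.sa, and below for gq_ttq.mad) add stream-ordered allocation and synchronisation wrappers, gpuMallocAsync, gpuFreeAsync and gpuStreamSynchronize, mapping onto the cudaMallocAsync/hipMallocAsync family. A minimal usage sketch (illustrative only; the buffer name and size are hypothetical, not taken from the patch):

// Sketch only (illustrative pattern, not from the patch): stream-ordered
// allocation of a temporary device buffer using the new wrappers, which
// expand to checkGpu-wrapped cudaMallocAsync/cudaFreeAsync/cudaStreamSynchronize
// calls (or their hip* equivalents).
gpuStream_t stream;
gpuStreamCreate( &stream );
fptype* devTmp = nullptr;
gpuMallocAsync( (void**)&devTmp, 1024 * sizeof( fptype ), stream ); // hypothetical size
// ... enqueue kernels that read/write devTmp on this stream ...
gpuFreeAsync( devTmp, stream );
gpuStreamSynchronize( stream ); // block until the queued work and the free complete
gpuStreamDestroy( stream );

diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt 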
b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 2a38116d7d..f2e018386d 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -57,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.001802682876586914  +DEBUG: model prefixing takes 0.0024957656860351562  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -166,17 +166,17 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.043 s +8 processes with 40 diagrams generated in 0.069 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  @@ -214,16 +214,16 @@ INFO: Finding symmetric diagrams for subprocess group gux_ttxux DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.025 s -Wrote files for 32 helas calls in 2.532 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.049 s +Wrote files for 32 helas calls in 3.302 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.094 s +ALOHA: aloha creates 2 routines in 0.104 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.079 s +ALOHA: aloha creates 4 routines in 0.087 s FFV1 FFV1 FFV1 @@ -244,17 +244,17 @@ INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  +DEBUG: result.returncode =  0 [output.py at line 275]  Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. 
Type "launch" to generate events from this process, or see /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m7.934s -user 0m1.348s -sys 0m0.746s -Code generation completed in 8 seconds +real 0m10.462s +user 0m1.475s +sys 0m0.703s +Code generation completed in 10 seconds /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 35d6d0610c..5091f85849 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -306,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -315,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -411,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -433,10 +434,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = 
DEN_ACCESS::kernelAccess( denominators ); @@ -461,7 +458,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -479,7 +476,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -496,7 +493,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -513,7 +510,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -530,7 +527,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -866,9 +863,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -944,8 +940,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
#endif /* clang-format on */ @@ -1007,25 +1002,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + totAllDenominators[ievt] += hAllDenominators[ievt]; // NB: the denominator must also keep its running sum over helicities (it is used in the division below) + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1070,16 +1075,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1147,6 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1156,6 +1190,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1167,8 +1203,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1211,7 +1249,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1230,7 +1268,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1242,6 +1282,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1250,9 +1291,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1263,9 +1305,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1303,9 +1351,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1320,7 +1365,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1368,82 +1414,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1468,13 +1529,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1482,7 +1537,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index bd42537623..0bf2e4625f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index cc683b55c1..9a64abfae3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -306,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // 
input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -315,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -411,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -433,10 +434,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -461,7 +458,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -479,7 +476,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -496,7 +493,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -513,7 +510,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[4], w_fp[5], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -530,7 +527,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -866,9 +863,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - 
constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -944,8 +940,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1007,25 +1002,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + totAllDenominators[ievt] += hAllDenominators[ievt]; // NB: the denominator must also keep its running sum over helicities (it is used in the division below) + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; + } - allMEs[ievt] *= 
totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1070,16 +1075,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1147,6 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1156,6 +1190,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool 
mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1167,8 +1203,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1211,7 +1249,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1230,7 +1268,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1242,6 +1282,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1250,9 +1291,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1263,9 +1305,15 @@ namespace mg5amcCpu // (3) 
Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1303,9 +1351,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1320,7 +1365,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1368,82 +1414,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
-          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-          if( iconfig <= 0 )
-          {
-            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-            assert( iconfig > 0 ); // SANITY CHECK #917
-          }
-          else if( iconfig > (int)mgOnGpu::nconfigSDE )
-          {
-            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-          }
-          fptype_sv targetamp[ncolor] = { 0 };
-          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-          for( int icolC = 0; icolC < ncolor; icolC++ )
-          {
-            if( icolC == 0 )
-              targetamp[icolC] = fptype_sv{ 0 };
-            else
-              targetamp[icolC] = targetamp[icolC - 1];
-            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-          }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          fptype_sv targetamp2[ncolor] = { 0 };
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+    const int vecsize = 2 * neppV;
+#else
+    const int vecsize = neppV;
+#endif
+    unsigned int channelIdVec[vecsize];
+    if( allChannelIds != nullptr )
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
       {
-            if( icolC == 0 )
-              targetamp2[icolC] = fptype_sv{ 0 };
-            else
-              targetamp2[icolC] = targetamp2[icolC - 1];
-            // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+        const int ievt = ievt00 + ieppV;
+        channelIdVec[ieppV] = allChannelIds[ievt];
       }
-#endif
-          for( int ieppV = 0; ieppV < neppV; ++ieppV )
+    }
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
       {
         const int ievt = ievt00 + ieppV;
-            //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-            for( int icolC = 0; icolC < ncolor; icolC++ )
+        fptype numerator_sum = 0., normalization = 0.;
+        for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
         {
-#if defined MGONGPU_CPPSIMD
-              // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-              volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-              const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-              if( okcol )
+          if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+          normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                         ichan * neppV + ieppV % neppV];
         }
+        channelIdVec[ieppV] = mgOnGpu::nchannels;
+        for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+        {
+          if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+          numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                         ichan * neppV + ieppV % neppV];
+          if( allrnddiagram[ievt] < numerator_sum / normalization )
           {
-                allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-                //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+            channelIdVec[ieppV] = ichan + 1;
             break;
           }
         }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        const int ievt2 = ievt00 + ieppV + neppV;
-        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+        allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+      }
+    }
+
+    // Event-by-event random choice of color #402
+    if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+      {
+        unsigned int channelId = channelIdVec[ieppV];
+        if( channelId > mgOnGpu::nchannels )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+        }
+        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+        if( iconfig <= 0 )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+          assert( iconfig > 0 ); // SANITY CHECK #917
+        }
+        else if( iconfig > (int)mgOnGpu::nconfigSDE )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+        }
+        fptype targetamp[ncolor] = { 0 };
+        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+        for( int icolC = 0; icolC < ncolor; icolC++ )
+        {
+          if( icolC == 0 )
+            targetamp[icolC] = 0;
+          else
+            targetamp[icolC] = targetamp[icolC - 1];
+          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+            jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+        }
+        const int ievt = ievt00 + ieppV;
+        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
         for( int icolC = 0; icolC < ncolor; icolC++ )
         {
-          if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+          if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
           {
-            allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-            //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+            allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+            //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
             break;
           }
         }
-#endif
       }
     }
     else
@@ -1468,13 +1529,7 @@ namespace mg5amcCpu
       // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
       // [NB 'sum over final spins, average over initial spins', eg see
       // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-      gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
       for( int ipagV = 0; ipagV < npagV; ++ipagV )
       {
         const int ievt0 = ipagV * neppV;
@@ -1482,7 +1537,7 @@
         fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
         MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+        if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
         {
           const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
           fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
index dd4aae8a06..9191598e88 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
@@ -166,6 +166,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -174,6 +175,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -190,6 +193,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -197,6 +201,8 @@ namespace mg5amcCpu
     int* allselcol, // output: helicity selection[nevt]
     fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
###$(info
processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index f3f83367a3..016603b556 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -57,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0017499923706054688  +DEBUG: model prefixing takes 0.001806020736694336  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -166,12 +166,12 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.043 s +8 processes with 40 diagrams generated in 0.080 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 @@ -184,29 +184,29 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 
'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=1 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=1 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 
'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. -Generated helas calls for 2 subprocesses (10 diagrams) in 0.026 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.037 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.062 s +ALOHA: aloha creates 2 routines in 0.081 s FFV1 FFV1 FFV1 @@ -222,7 +222,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
quit -real 0m1.860s -user 0m0.420s -sys 0m0.146s +real 0m2.356s +user 0m0.505s +sys 0m0.153s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... 
Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index 1236fdcfcc..0bf3c7da89 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -306,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -315,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -411,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * 
blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -433,10 +434,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -846,9 +843,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -924,8 +920,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
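The boolean passed to calculate_jamps above replaces the old channelId argument: during helicity filtering the per-diagram channel weights are not needed, so false is passed, while in sigmaKin the numerators of all diagrams are stored and later drive the new event-by-event choice of channel (see the select_col_and_diag and sigmaKin hunks below). That choice is an inverse-CDF draw over the per-diagram numerators. A minimal standalone sketch, with hypothetical names (sampleChannel, numerators, rnd) and double in place of fptype; channels whose channel2iconfig entry is -1 are skipped, as in mgOnGpu:

// Sketch only (not the generated code): draw a 1-based channelId with
// probability numerators[i] / (sum over contributing diagrams).
int sampleChannel( const double* numerators, const int* channel2iconfig, int nchannels, double rnd )
{
  double norm = 0.;
  for( int i = 0; i < nchannels; ++i )
    if( channel2iconfig[i] != -1 ) norm += numerators[i]; // skip channels without an SDE config
  double cumsum = 0.;
  for( int i = 0; i < nchannels; ++i )
  {
    if( channel2iconfig[i] == -1 ) continue;
    cumsum += numerators[i];
    if( rnd < cumsum / norm ) return i + 1; // 1-based, like channelId in the kernels
  }
  return nchannels; // fallback, mirroring 'channelId = mgOnGpu::nchannels' set before the loop
}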
#endif /* clang-format on */ @@ -987,25 +982,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1050,16 +1055,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1127,6 +1160,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -1136,6 +1170,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1147,8 +1183,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol, // output: helicity selection[nevt]
-    fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1191,7 +1229,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1210,7 +1248,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1222,6 +1262,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1230,9 +1271,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1243,9 +1285,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1283,9 +1331,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1300,7 +1345,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1348,82 +1394,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+        }
+        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+        if( iconfig <= 0 )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+          assert( iconfig > 0 ); // SANITY CHECK #917
+        }
+        else if( iconfig > (int)mgOnGpu::nconfigSDE )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+        }
+        fptype targetamp[ncolor] = { 0 };
+        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+        for( int icolC = 0; icolC < ncolor; icolC++ )
+        {
+          if( icolC == 0 )
+            targetamp[icolC] = 0;
+          else
+            targetamp[icolC] = targetamp[icolC - 1];
+          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+            jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+        }
+        const int ievt = ievt00 + ieppV;
+        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
         for( int icolC = 0; icolC < ncolor; icolC++ )
         {
-          if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+          if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
           {
-            allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-            //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+            allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+            //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
             break;
           }
         }
-#endif
       }
     }
     else
@@ -1448,13 +1509,7 @@ namespace mg5amcCpu
       // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
       // [NB 'sum over final spins, average over initial spins', eg see
       // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-      gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
       for( int ipagV = 0; ipagV < npagV; ++ipagV )
      {
         const int ievt0 = ipagV * neppV;
@@ -1462,7 +1517,7 @@ namespace mg5amcCpu
         fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
         MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+        if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
         {
           const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
           fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h
index bd42537623..0bf2e4625f 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h
@@ -166,6 +166,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -174,6 +175,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -190,6 +193,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -197,6 +201,8 @@ namespace mg5amcCpu
     int* allselcol, // output: helicity selection[nevt]
     fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc
index d4db88aa57..959f91ae65 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc
@@ -306,7 +306,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
     fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+    bool storeChannelWeights,
     fptype* allNumerators, // input/output: multichannel
numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -315,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -411,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -433,10 +434,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -846,9 +843,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -924,8 +920,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
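With per-diagram numerators, the C++ buffer is laid out page-major: each SIMD page of neppV events stores ndiagrams blocks of neppV values, which is what the expression allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV] in the C++ sigmaKin hunks of this patch encodes (on the GPU the layout is simply allNumerators[ievt * ndiagrams + ichan]). A small sketch of the equivalent index computation, under a hypothetical helper name:

// Sketch only: flat index of the numerator of (0-based) diagram 'ichan' for event 'ievt'
// in a buffer laid out as [ipagV][ichan][ieppV], i.e. ndiagrams blocks of neppV values
// per SIMD page of neppV events.
inline int numeratorIndex( int ievt, int ichan, int neppV, int ndiagrams )
{
  const int ipagV = ievt / neppV; // SIMD page containing this event
  const int ieppV = ievt % neppV; // position of the event within its page
  return ( ipagV * ndiagrams + ichan ) * neppV + ieppV; // == ievt / neppV * neppV * ndiagrams + ichan * neppV + ievt % neppV
}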
#endif /* clang-format on */ @@ -987,25 +982,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1050,16 +1055,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1127,6 +1160,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -1136,6 +1170,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1147,8 +1183,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol, // output: helicity selection[nevt]
-    fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1191,7 +1229,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1210,7 +1248,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1222,6 +1262,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1230,9 +1271,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1243,9 +1285,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1283,9 +1331,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1300,7 +1345,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1348,82 +1394,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1448,13 +1509,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1462,7 +1517,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git
a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index dd4aae8a06..9191598e88 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o 
$(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index fb6910fcf6..11b54f703e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -130,10 +130,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --ve Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_heft_gg_bb INFO: remove old information in CODEGEN_mad_heft_gg_bb -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  @@ -152,20 +152,20 @@ INFO: Finding symmetric diagrams for subprocess group gg_bbx DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1564]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1588]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1589]  -Generated helas calls for 1 subprocesses (4 diagrams) in 0.005 s -Wrote files for 12 helas calls in 1.556 s +Generated helas calls for 1 subprocesses (4 diagrams) in 0.011 s +Wrote files for 12 helas calls in 2.377 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.120 s +ALOHA: aloha creates 4 routines in 0.164 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.112 s +ALOHA: aloha creates 8 routines in 0.125 s VVS3 VVV1 FFV1 @@ -187,17 +187,17 @@ INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams 
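Note on the select_col_and_diag kernel introduced above: when allrnddiagram is provided, the channel is drawn by inverse-CDF sampling over the per-diagram numerators, skipping channels that have no SDE iconfig. A minimal standalone sketch of that logic follows, assuming hypothetical names (numerators, channel2iconfig, nchannels and rnd stand in for the per-event slice of allNumerators, mgOnGpu::channel2iconfig, mgOnGpu::nchannels and allrnddiagram[ievt]); this is an illustration, not the generated code:

  #include <cassert>
  // Sketch only: draw a 1-based channelId with probability proportional to its numerator.
  // Channels whose channel2iconfig entry is -1 carry no SDE config and are skipped,
  // mirroring the kernel above; rnd is a uniform random number in [0,1).
  unsigned int sampleChannel( const double* numerators, const int* channel2iconfig, int nchannels, double rnd )
  {
    double normalization = 0.;
    for( int ichan = 0; ichan < nchannels; ichan++ )
      if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
    assert( normalization > 0. ); // at least one valid channel must contribute
    double cumsum = 0.;
    unsigned int channelId = nchannels; // fallback, as in the kernel (covers rnd close to 1)
    for( int ichan = 0; ichan < nchannels; ichan++ )
    {
      if( channel2iconfig[ichan] == -1 ) continue;
      cumsum += numerators[ichan];
      if( rnd < cumsum / normalization ) { channelId = ichan + 1; break; }
    }
    return channelId;
  }

Sampling in proportion to the single-diagram numerators is what lets the drawn channelId feed the same color-selection path as a Fortran-supplied channelId.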
INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  +DEBUG: result.returncode =  0 [output.py at line 275]  Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. Type "launch" to generate events from this process, or see /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README Run "open index.html" to see more information about this process. quit -real 0m6.775s -user 0m1.308s -sys 0m0.633s -Code generation completed in 7 seconds +real 0m9.223s +user 0m1.254s +sys 0m0.660s +Code generation completed in 9 seconds /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc index 282f710a83..c04f054aa9 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = 
DEN_ACCESS::kernelAccess( denominators ); @@ -455,7 +452,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFS2_0( w_fp[3], w_fp[2], w_fp[4], COUPs[ndcoup + 0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -471,7 +468,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -488,7 +485,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -504,7 +501,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -822,9 +819,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -900,8 +896,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
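Note on the enlarged numerator buffers above: each event now stores processConfig::ndiagrams numerator values instead of one (denominators remain one value per event per helicity slot), and the GPU super-buffer is indexed by helicity slot, then event, with the diagram index fastest. A short sketch of the addressing this implies, assuming hypothetical helper names (numeratorBlock, addHelicity):

  // Sketch only: the per-diagram block of event ievt in helicity slot ighel starts at
  // (ighel * nevt + ievt) * ndiagrams, matching the hAllNumerators offsets used above.
  inline double* numeratorBlock( double* ghelAllNumerators, int ighel, int nevt, int ievt, int ndiagrams )
  {
    return ghelAllNumerators + ( static_cast<long>( ighel ) * nevt + ievt ) * ndiagrams;
  }
  // Fold helicity ighel > 0 into the "helicity #0" running totals for one event,
  // as normalise_output does before applying the channel weight.
  inline void addHelicity( double* ghelAllNumerators, int ighel, int nevt, int ievt, int ndiagrams )
  {
    double* tot = numeratorBlock( ghelAllNumerators, 0, nevt, ievt, ndiagrams );
    const double* hel = numeratorBlock( ghelAllNumerators, ighel, nevt, ievt, ndiagrams );
    for( int idiag = 0; idiag < ndiagrams; ++idiag ) tot[idiag] += hel[idiag];
  }

With this layout, the per-thread pointer &allNumerators[ievt * processConfig::ndiagrams] taken in calculate_jamps and the ( ievt + ighel * nevt ) * processConfig::ndiagrams offset used in normalise_output address the same per-event blocks.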
#endif /* clang-format on */ @@ -963,25 +958,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1026,16 +1031,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: per-diagram numerators (summed over helicities) + const fptype* allDenominators, // input: denominators (summed over helicities) + const int nevt ) // input: #events (for cuda: nevt == ndim ==
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1103,6 +1136,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1112,6 +1146,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1123,8 +1159,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1167,7 +1205,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1186,7 +1224,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1198,6 +1238,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1206,9 +1247,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1219,9 +1261,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1259,9 +1307,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1276,7 +1321,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1324,82 +1370,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1424,13 +1485,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1438,7 +1493,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git
a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h index a4c60bf837..bfcb1209e9 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o 
$(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index 19767c3f2d..13878ae8fd 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -58,6 +58,54 @@ set auto_convert_model T save options auto_convert_model save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft +INFO: download model from https://madgraph.mi.infn.it/Downloads/models/heft.tgz to the following directory: /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/models  +--2025-12-11 12:36:44-- https://madgraph.mi.infn.it/Downloads/models/heft.tgz +Resolving madgraph.mi.infn.it (madgraph.mi.infn.it)... 192.135.21.75 +Connecting to madgraph.mi.infn.it (madgraph.mi.infn.it)|192.135.21.75|:443... connected. +HTTP request sent, awaiting response... 200 OK +Length: 50876 (50K) [application/x-gzip] +Saving to: ‘tmp.tgz’ + + 0K .......... .......... .......... .......... ......... 100% 3.00M=0.02s + +2025-12-11 12:36:44 (3.00 MB/s) - ‘tmp.tgz’ saved [50876/50876] + +heft/ +heft/write_param_card.py +heft/restrict_ckm.dat +heft/couplings.py +heft/HEFT_UFO.log +heft/lorentz.py +heft/__init__.py +heft/__pycache__/ +heft/particles.py +heft/object_library.py +heft/restrict_default.dat +heft/restrict_zeromass_ckm.dat +heft/restrict_no_b_mass.dat +heft/function_library.py +heft/parameters.py +heft/py3_model.pkl +heft/coupling_orders.py +heft/restrict_no_tau_mass.dat +heft/vertices.py +heft/restrict_no_masses.dat +heft/__pycache__/write_param_card.cpython-311.pyc +heft/__pycache__/parameters.cpython-311.pyc +heft/__pycache__/function_library.cpython-311.pyc +heft/__pycache__/coupling_orders.cpython-311.pyc +heft/__pycache__/object_library.cpython-311.pyc +heft/__pycache__/couplings.cpython-311.pyc +heft/__pycache__/particles.cpython-311.pyc +heft/__pycache__/vertices.cpython-311.pyc +heft/__pycache__/lorentz.cpython-311.pyc +heft/__pycache__/__init__.cpython-311.pyc +INFO: reload from .py file +INFO: load particles +INFO: load vertices +WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  +WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  +DEBUG: model prefixing takes 0.0019080638885498047  INFO: Restrict model heft with file models/heft/restrict_default.dat . 
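Note on the vectorized C++ path earlier in this patch: per-diagram numerators are stored page-major, i.e. within each SIMD page of neppV events the neppV lanes of one diagram are contiguous, which is why the sampling loop indexes allNumerators as ievt / neppV * neppV * ndiagrams + ichan * neppV + ieppV % neppV. A sketch of that decomposition, assuming a hypothetical helper name (numeratorIndex):

  #include <cstddef>
  // Sketch only: layout is [page][idiag][lane] with page = ievt / neppV and
  // lane = ievt % neppV (equal to ieppV % neppV above, since ievt00 is a
  // multiple of neppV), matching the SIMD diagram-sampling loop.
  inline std::size_t numeratorIndex( int ievt, int ichan, int neppV, int ndiagrams )
  {
    const int page = ievt / neppV;
    const int lane = ievt % neppV;
    return static_cast<std::size_t>( page ) * neppV * ndiagrams // base of this event page
           + static_cast<std::size_t>( ichan ) * neppV          // base of this diagram's lanes
           + lane;                                              // SIMD lane within the page
  }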
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -127,28 +175,28 @@ INFO: Process has 4 diagrams Total: 1 processes with 4 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_bb Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 
'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. -Generated helas calls for 1 subprocesses (4 diagrams) in 0.010 s +Generated helas calls for 1 subprocesses (4 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.127 s +ALOHA: aloha creates 4 routines in 0.134 s VVS3 VVV1 FFV1 @@ -165,7 +213,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. quit -real 0m1.577s -user 0m0.495s -sys 0m0.136s -Code generation completed in 1 seconds +real 0m2.583s +user 0m0.606s +sys 0m0.182s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- 
a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc index 7ae2e2ed53..e97d656ef0 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( 
numerators );
      fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
@@ -806,9 +803,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -884,8 +880,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -947,25 +942,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply the matrix element by the channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag =
0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1010,16 +1015,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1087,6 +1120,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1096,6 +1130,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: 
allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1107,8 +1143,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1151,7 +1189,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1170,7 +1208,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1182,6 +1222,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1190,9 +1231,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], 
ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1203,9 +1245,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1243,9 +1291,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1260,7 +1305,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1308,82 +1354,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1408,13 +1469,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1422,7 +1477,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h index a4c60bf837..bfcb1209e9 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s 
super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt 
b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt index 912b9ddf90..7f8baeac9e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt @@ -57,7 +57,7 @@ set zerowidth_tchannel F import model sm-no_b_mass INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0017840862274169922  +DEBUG: model prefixing takes 0.0018579959869384766  INFO: Restrict model sm-no_b_mass with file models/sm/restrict_no_b_mass.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -181,7 +181,7 @@ INFO: Process u~ d > t t~ w- added to mirror process d u~ > t t~ w- INFO: Process c~ s > t t~ w- added to mirror process s c~ > t t~ w- INFO: Process d~ u > t t~ w+ added to mirror process u d~ > t t~ w+ INFO: Process s~ c > t t~ w+ added to mirror process c s~ > t t~ w+ -4 processes with 8 diagrams generated in 0.081 s +4 processes with 8 diagrams generated in 0.100 s Total: 4 processes with 8 diagrams add process p p > t t~ w j @1 INFO: Checking for minimal orders which gives processes. @@ -223,17 +223,17 @@ INFO: Process d~ g > t t~ w+ u~ added to mirror process g d~ > t t~ w+ u~ INFO: Process d~ u > t t~ w+ g added to mirror process u d~ > t t~ w+ g INFO: Process s~ g > t t~ w+ c~ added to mirror process g s~ > t t~ w+ c~ INFO: Process s~ c > t t~ w+ g added to mirror process c s~ > t t~ w+ g -12 processes with 144 diagrams generated in 0.265 s +12 processes with 144 diagrams generated in 0.282 s Total: 16 processes with 152 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_nobm_pp_ttW INFO: remove old information in CODEGEN_mad_nobm_pp_ttW -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  @@ -351,18 +351,18 @@ INFO: Finding symmetric diagrams for subprocess group dux_ttxwm DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1564]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1588]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1589]  -Generated helas calls for 8 subprocesses (76 diagrams) in 0.101 s -Wrote files for 212 helas calls in 11.333 s +Generated helas calls for 8 subprocesses (76 diagrams) in 0.122 s +Wrote files for 212 helas calls in 17.074 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 3 routines in 0.099 s +ALOHA: aloha creates 3 routines 
in 0.141 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 6 routines in 0.101 s +ALOHA: aloha creates 6 routines in 0.115 s FFV1 FFV1 FFV1 @@ -385,17 +385,17 @@ INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  +DEBUG: result.returncode =  0 [output.py at line 275]  Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. Type "launch" to generate events from this process, or see /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README Run "open index.html" to see more information about this process. quit -real 0m18.260s -user 0m2.773s -sys 0m1.309s -Code generation completed in 19 seconds +real 0m25.842s +user 0m2.903s +sys 0m1.311s +Code generation completed in 26 seconds /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc index b04a53b56a..96144a4b95 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc @@ -304,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -313,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -409,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -431,10 +432,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& 
denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -459,7 +456,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -476,7 +473,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -829,9 +826,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -907,8 +903,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
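// NB: the per-diagram numerator buffers introduced by this patch are laid out, on the GPU side,
// as one [ievt][idiag] block per good helicity, i.e. index = ( ighel * nevt + ievt ) * ndiagrams + idiag.
// A minimal sketch of that indexing convention, which the normalise_output and sigmaKin changes
// below assume (the helper name 'numeratorIndex' is illustrative only and is not part of the patch):
//
//   __host__ __device__ inline size_t
//   numeratorIndex( int ighel, int ievt, int idiag, int nevt )
//   {
//     return ( (size_t)ighel * nevt + ievt ) * processConfig::ndiagrams + idiag;
//   }
//
// This is consistent with hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams
// (the slice for one helicity) and with the gpuMemset of cNGoodHel * processConfig::ndiagrams * nevt
// fptype values; the denominators keep their old one-value-per-event layout.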
#endif /* clang-format on */
@@ -970,25 +965,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply the matrix element by the channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -1033,16 +1038,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim ==
@@ -1033,16 +1038,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
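The new block above is an inverse-CDF draw: channels whose channel2iconfig entry is -1 are excluded from the normalization, and the random number allrnddiagram[ievt] is compared against the growing cumulative fraction of the per-diagram numerators, with the last channel acting as a fallback against rounding. A standalone sketch of the same algorithm (plain C++, illustrative names):

  // Sketch only: inverse-CDF sampling of a 1-based channel id from per-diagram weights.
  #include <vector>
  unsigned int sampleDiagram( const std::vector<double>& numerators,   // per-diagram |amp|^2 weights for one event
                              const std::vector<int>& channel2iconfig, // -1 marks channels without an SDE config
                              double rnd )                             // uniform random number in [0,1)
  {
    double norm = 0.;
    for( size_t i = 0; i < numerators.size(); i++ )
      if( channel2iconfig[i] != -1 ) norm += numerators[i];
    double cumsum = 0.;
    for( size_t i = 0; i < numerators.size(); i++ )
    {
      if( channel2iconfig[i] == -1 ) continue;
      cumsum += numerators[i];
      if( rnd < cumsum / norm ) return i + 1; // 1-based channelId
    }
    return numerators.size(); // fallback against rounding, as in the kernel
  }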
@@ -1110,6 +1143,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -1119,6 +1153,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps,              // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1130,8 +1166,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             int* allselcol,                    // output: helicity selection[nevt]
-            fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            fptype* allNumerators,             // output: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators,           // output: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             const int nevt                     // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1174,7 +1212,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1193,7 +1231,9 @@ namespace mg5amcCpu
       fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
       for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1205,6 +1245,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++

   // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+
   // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
   // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
   // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1213,9 +1254,10 @@ namespace mg5amcCpu
     const int ihel = cGoodHel[ighel];
     fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
     fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
     gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1226,9 +1268,15 @@ namespace mg5amcCpu
   // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
   // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
   gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-  // Event-by-event random choice of color #402
-  gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+  bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+  // Event-by-event random choice of color and diagram #402
+  gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif

   // *** END OF PART 1a - CUDA (one event per GPU thread) ***
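Taken together, these hunks grow the numerator super-buffer from one scalar per event and helicity to ndiagrams scalars per event and helicity, laid out as [ighel][ievt][idiag]: each helicity stream owns a contiguous nevt*ndiagrams slice, and inside the kernel each GPU thread owns a contiguous ndiagrams slice of that. A sketch of the indexing (illustrative helper, not part of the patch):

  // Sketch only: flat index into the enlarged ghelAllNumerators super-buffer.
  #include <cstddef>
  inline std::size_t numeratorIndex( std::size_t ighel, std::size_t ievt, std::size_t idiag,
                                     std::size_t nevt, std::size_t ndiagrams )
  {
    return ( ighel * nevt + ievt ) * ndiagrams + idiag;
  }
  // e.g. hAllNumerators = ghelAllNumerators + numeratorIndex( ighel, 0, 0, nevt, ndiagrams )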
@@ -1266,9 +1314,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1283,7 +1328,8 @@ namespace mg5amcCpu
         cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
         // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+        bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
         calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1331,82 +1377,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
         }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
           const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
          {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
             {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
               break;
             }
           }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -1431,13 +1492,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -1445,7 +1500,7 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
         const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
         fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
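On the C++/SIMD side the same per-diagram numerators are stored page-major instead, [ipagV][idiag][ieppV], which is what the repeated index expression ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV computes. An equivalent helper, for illustration only:

  // Sketch only: flat index into the SIMD-side per-diagram numerator buffer.
  #include <cstddef>
  inline std::size_t simdNumeratorIndex( std::size_t ievt, std::size_t ichan,
                                         std::size_t neppV, std::size_t ndiagrams )
  {
    const std::size_t ipagV = ievt / neppV; // SIMD page of this event
    const std::size_t ieppV = ievt % neppV; // position of the event within the page
    return ( ipagV * ndiagrams + ichan ) * neppV + ieppV;
  }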
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h
index b9c21cb625..8db125293c 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h
@@ -164,6 +164,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -172,6 +173,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps,              // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -188,6 +191,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -195,6 +199,8 @@ namespace mg5amcCpu
             int* allselcol,                    // output: helicity selection[nevt]
             fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
             fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
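The header now declares one extra input (allrnddiagram) and two extra multichannel outputs/flags (allDiagramIdsOut, mulChannelWeight) for both the GPU and C++ variants of sigmaKin, so every caller must be updated in lockstep. Callers that do not want the new behaviour can pass neutral values; a hypothetical sketch (names illustrative, other arguments elided):

  // Sketch only: neutral values for the new sigmaKin arguments.
  using fptype = double;                    // stand-in for the plugin's configurable fp type
  const fptype* allrnddiagram = nullptr;    // no event-by-event diagram sampling
  unsigned int* allDiagramIdsOut = nullptr; // only written when allrnddiagram != nullptr
  bool mulChannelWeight = false;            // leave |M|^2 unweighted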
add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -313,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -409,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -431,10 +432,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -459,7 +456,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -476,7 +473,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -829,9 +826,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -907,8 +903,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all 
         cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
         calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -970,25 +965,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -1033,16 +1038,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1110,6 +1143,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -1119,6 +1153,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps,              // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1130,8 +1166,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             int* allselcol,                    // output: helicity selection[nevt]
-            fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            fptype* allNumerators,             // output: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators,           // output: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             const int nevt                     // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1174,7 +1212,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1193,7 +1231,9 @@ namespace mg5amcCpu
       fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
       for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
         numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1205,6 +1245,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++

   // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+
   // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
   // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
   // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1213,9 +1254,10 @@ namespace mg5amcCpu
     const int ihel = cGoodHel[ighel];
     fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
     fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
     gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1226,9 +1268,15 @@ namespace mg5amcCpu
   // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
   // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
   gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-  // Event-by-event random choice of color #402
-  gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+  bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+  // Event-by-event random choice of color and diagram #402
+  gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif

   // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1266,9 +1314,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1283,7 +1328,8 @@ namespace mg5amcCpu
         cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
         // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+        bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
         calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1331,82 +1377,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -1431,13 +1492,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -1445,7 +1500,7 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
         const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
         fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
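Note the asymmetry in the C++ tail above: the division by helcolDenominators[0] always happens, while the single-diagram-enhancement weight is applied only when both mulChannelWeight is set and a channelId array was supplied (the pointer test, not channelIds[0], is what fixes segfault #892). A compact sketch of that ordering, with illustrative names:

  // Sketch only: order of operations in the C++ normalisation tail for one event.
  #include <cstddef>
  #include <vector>
  double normaliseME( double me, double helcolDenominator,
                      const std::vector<double>& numerators, double denominator,
                      const unsigned int* allChannelIds, std::size_t ievt, bool mulChannelWeight )
  {
    me /= helcolDenominator; // average over helicities and colors
    if( mulChannelWeight && allChannelIds != nullptr ) // test the pointer, not channelIds[0] (#892)
      me *= numerators[allChannelIds[ievt] - 1] / denominator; // SDE channel weight
    return me;
  }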
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h
index 43024d30fd..88ccb8aebc 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h
@@ -164,6 +164,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -172,6 +173,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps,              // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -188,6 +191,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -195,6 +199,8 @@ namespace mg5amcCpu
             int* allselcol,                    // output: helicity selection[nevt]
             fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
             fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc
index 506791a8b1..733db97179 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc
@@ -304,7 +304,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
                    fptype* allJamps,         // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   bool storeChannelWeights, // input: true to accumulate the per-diagram multichannel numerators and denominators
                    fptype* allNumerators,    // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype* allDenominators,  // input/output: multichannel denominators[nevt], add helicity ihel
                    fptype* colAllJamp2s,     // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -313,7 +313,7 @@ namespace mg5amcCpu
 #else
                    cxtype_sv* allJamp_sv,    // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                   const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+                   bool storeChannelWeights, // input: true to accumulate the per-diagram multichannel numerators and denominators
                    fptype* allNumerators,    // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype* allDenominators,  // input/output: multichannel denominators[nevt], add helicity ihel
                    fptype_sv* jamp2_sv,      // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -409,7 +409,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -431,10 +432,6 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
     fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
@@ -462,7 +459,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 1
     FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[0] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -479,7 +476,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 2
     FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[1] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -498,7 +495,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 3
     FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[2] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -515,7 +512,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 4
     FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[3] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -533,7 +530,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 5
     FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[4] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -550,7 +547,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 6
     FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[5] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -567,7 +564,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 7
     FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[6] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -584,7 +581,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 8
     FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[7] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -601,7 +598,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 9
     FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[8] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -618,7 +615,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 10
     FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[9] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -635,7 +632,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 11
     FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[10] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -652,7 +649,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 12
     FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[11] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -1055,9 +1052,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -1133,8 +1129,7 @@ namespace mg5amcCpu
         cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
         calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1196,25 +1191,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -1259,16 +1264,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1336,6 +1369,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1345,6 +1379,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1356,8 +1392,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1400,7 +1438,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1419,7 +1457,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1431,6 +1471,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1439,9 +1480,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1452,9 +1494,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one 
event per GPU thread) *** @@ -1492,9 +1540,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1509,7 +1554,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1557,82 +1603,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1657,13 +1718,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1671,7 +1726,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h index b168a7dacf..c06c1088d2 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc index 2bccd80866..c3009cc038 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc @@ -304,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel 
numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -313,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -409,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -431,10 +432,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -462,7 +459,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -479,7 +476,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -498,7 +495,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -515,7 +512,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -533,7 +530,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -550,7 +547,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( 
channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -567,7 +564,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -584,7 +581,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -601,7 +598,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -618,7 +615,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -635,7 +632,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -652,7 +649,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1055,9 +1052,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1133,8 +1129,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
#else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1196,25 +1191,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1259,16 +1264,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all 
denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1336,6 +1369,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1345,6 +1379,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1356,8 +1392,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1400,7 +1438,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1419,7 +1457,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1431,6 +1471,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1439,9 +1480,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1452,9 +1494,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one 
event per GPU thread) *** @@ -1492,9 +1540,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1509,7 +1554,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1557,82 +1603,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1657,13 +1718,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1671,7 +1726,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h index daa474c26a..649bf473b3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc index 1550234c05..bb7db7b6ef 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc @@ -304,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel 
numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -313,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -409,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -431,10 +432,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -462,7 +459,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -479,7 +476,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -498,7 +495,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -515,7 +512,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -533,7 +530,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -550,7 +547,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( 
channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -567,7 +564,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -584,7 +581,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -601,7 +598,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -618,7 +615,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -635,7 +632,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -652,7 +649,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1055,9 +1052,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1133,8 +1129,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1196,25 +1191,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
 {
   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
   allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   const int nevt = gridDim.x * blockDim.x;
-  if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+  if( storeChannelWeights )
   {
     fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
     fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
     for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
     {
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt];
+      fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+      fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+      for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+      {
+        firstNumerator[idiag] += hAllNumerators[idiag];
+      }
+      totAllDenominators[ievt] += hAllDenominators[ievt]; // still needed below: sum the denominators over helicities too
+    }
+    if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    {
+      unsigned int channelId = allChannelIds[ievt];
+      allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
     }
-    allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
   }
 #endif
   return;
@@ -1259,16 +1264,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all
denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 {
   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
   // SCALAR channelId for the current event (CUDA)
   unsigned int channelId = gpu_channelId( allChannelIds );
   // Event-by-event random choice of color #402
+
+  // Event-by-event random choice of channel
+  if( allrnddiagram != nullptr )
+  {
+    fptype numerator_sum = 0., normalization = 0.;
+    for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+    {
+      if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+      normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+    }
+    channelId = mgOnGpu::nchannels;
+    for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+    {
+      if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+      numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      if( allrnddiagram[ievt] < numerator_sum / normalization )
+      {
+        channelId = ichan + 1;
+        break;
+      }
+    }
+    allDiagramIdsOut[ievt] = channelId;
+  }
+
   if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
   {
     if( channelId > mgOnGpu::nchannels )
@@ -1336,6 +1369,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -1345,6 +1379,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps,              // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1356,8 +1392,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             int* allselcol,                    // output: helicity selection[nevt]
-            fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            fptype* allNumerators,             // output: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators,           // output: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             const int nevt                     // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1400,7 +1438,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-
gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1419,7 +1457,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1431,6 +1471,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1439,9 +1480,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1452,9 +1494,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one 
event per GPU thread) *** @@ -1492,9 +1540,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1509,7 +1554,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1557,82 +1603,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1657,13 +1718,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1671,7 +1726,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h
index d4d5408ad2..5330725977 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h
@@ -164,6 +164,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -172,6 +173,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps,              // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -188,6 +191,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -195,6 +199,8 @@ namespace mg5amcCpu
             int* allselcol,                    // output: helicity selection[nevt]
             fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
             fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             const int nevt );                  // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc
index e3f2d0c976..d69812222a 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc
@@ -304,7 +304,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
                     fptype* allJamps,                  // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                    bool storeChannelWeights,
                     fptype* allNumerators,             // input/output: multichannel
numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -313,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -409,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -431,10 +432,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -462,7 +459,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -479,7 +476,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -498,7 +495,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -515,7 +512,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -533,7 +530,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -550,7 +547,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( 
channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -567,7 +564,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -584,7 +581,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -601,7 +598,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -618,7 +615,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -635,7 +632,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -652,7 +649,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1055,9 +1052,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1133,8 +1129,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
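// Sketch (not part of the generated code): the diagram choice added in
// select_col_and_diag, and mirrored in the C++ SIMD path, is an inverse-CDF draw
// over the per-diagram numerators, skipping channels whose channel2iconfig entry
// is -1 (no associated SDE config). A self-contained host-side sketch, assuming
// rnd is uniform in [0,1) and channelId uses the Fortran-style range [1,nchannels]:
#include <cassert>
#include <vector>
unsigned int sampleChannel( const std::vector<double>& numerators,   // per-channel weights for one event
                            const std::vector<int>& channel2iconfig, // -1 flags channels without an SDE config
                            double rnd )                             // uniform random number in [0,1)
{
  double normalization = 0.;
  for( size_t ichan = 0; ichan < numerators.size(); ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  assert( normalization > 0. ); // an all-zero weight vector would make the draw ill-defined
  double cumulative = 0.;
  for( size_t ichan = 0; ichan < numerators.size(); ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue;
    cumulative += numerators[ichan];
    if( rnd < cumulative / normalization ) return ichan + 1; // Fortran-style channelId in [1,nchannels]
  }
  return static_cast<unsigned int>( numerators.size() ); // fallback for rnd ~ 1 under rounding, matching the kernels above
}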
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1196,25 +1191,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
 {
   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
   allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   const int nevt = gridDim.x * blockDim.x;
-  if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+  if( storeChannelWeights )
   {
     fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
     fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
     for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
     {
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt];
+      fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+      fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+      for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+      {
+        firstNumerator[idiag] += hAllNumerators[idiag];
+      }
+      totAllDenominators[ievt] += hAllDenominators[ievt]; // still needed below: sum the denominators over helicities too
+    }
+    if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    {
+      unsigned int channelId = allChannelIds[ievt];
+      allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
     }
-    allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
   }
 #endif
   return;
@@ -1259,16 +1264,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all
denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 {
   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
   // SCALAR channelId for the current event (CUDA)
   unsigned int channelId = gpu_channelId( allChannelIds );
   // Event-by-event random choice of color #402
+
+  // Event-by-event random choice of channel
+  if( allrnddiagram != nullptr )
+  {
+    fptype numerator_sum = 0., normalization = 0.;
+    for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+    {
+      if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+      normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+    }
+    channelId = mgOnGpu::nchannels;
+    for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+    {
+      if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+      numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      if( allrnddiagram[ievt] < numerator_sum / normalization )
+      {
+        channelId = ichan + 1;
+        break;
+      }
+    }
+    allDiagramIdsOut[ievt] = channelId;
+  }
+
   if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
   {
     if( channelId > mgOnGpu::nchannels )
@@ -1336,6 +1369,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -1345,6 +1379,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps,              // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1356,8 +1392,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             int* allselcol,                    // output: helicity selection[nevt]
-            fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            fptype* allNumerators,             // output: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators,           // output: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             const int nevt                     // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1400,7 +1438,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-
gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1419,7 +1457,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1431,6 +1471,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1439,9 +1480,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1452,9 +1494,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one 
event per GPU thread) *** @@ -1492,9 +1540,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1509,7 +1554,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1557,82 +1603,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1657,13 +1718,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1671,7 +1726,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h
index 6aaf443f35..362cd39944 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h
@@ -164,6 +164,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -172,6 +173,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps,              // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -188,6 +191,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -195,6 +199,8 @@ namespace mg5amcCpu
             int* allselcol,                    // output: helicity selection[nevt]
             fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
             fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
             const int nevt );                  // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc
index fd9d2b525b..c9b72712e9 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc
@@ -304,7 +304,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
                     fptype* allJamps,                  // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                    bool storeChannelWeights,
                     fptype* allNumerators,             // input/output: multichannel
numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -313,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -409,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -431,10 +432,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -462,7 +459,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -479,7 +476,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -498,7 +495,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -515,7 +512,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -533,7 +530,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -550,7 +547,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( 
channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -567,7 +564,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -584,7 +581,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -601,7 +598,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -618,7 +615,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -635,7 +632,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -652,7 +649,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1055,9 +1052,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1133,8 +1129,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
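The hunks above replace the per-event 'channelId != 0' test inside calculate_jamps with a single storeChannelWeights flag, and widen the numerator bookkeeping from a single running numerator per event to one slot per diagram (indexed via processConfig::ndiagrams). A minimal standalone sketch of that accumulation, in plain C++ with scalar doubles standing in for fptype_sv; accumulateChannelWeights and its arguments are illustrative names, not part of the patch:

    #include <complex>
    #include <cstddef>
    #include <vector>

    // Sketch of the accumulation now guarded by storeChannelWeights in
    // calculate_jamps: every diagram adds |amp|^2 to its own numerator slot,
    // and all diagrams add |amp|^2 to one shared denominator.
    void accumulateChannelWeights( bool storeChannelWeights,
                                   const std::vector<std::complex<double>>& amps, // one amplitude per diagram
                                   std::vector<double>& numerators,               // one slot per diagram (size ndiagrams)
                                   double& denominator )
    {
      if( !storeChannelWeights ) return; // replaces the old 'if( channelId != 0 )' test
      for( std::size_t idiag = 0; idiag < amps.size(); ++idiag )
      {
        const double a2 = std::norm( amps[idiag] ); // cxabs2 equivalent: |amp|^2
        numerators[idiag] += a2;
        denominator += a2;
      }
    }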
#else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1196,25 +1191,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1259,16 +1264,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all 
denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1336,6 +1369,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1345,6 +1379,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1356,8 +1392,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1400,7 +1438,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1419,7 +1457,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1431,6 +1471,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1439,9 +1480,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1452,9 +1494,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one 
event per GPU thread) *** @@ -1492,9 +1540,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1509,7 +1554,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1557,82 +1603,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1657,13 +1718,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1671,7 +1726,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h index a4f9928134..dd4e039f85 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc index 61ace6e710..7ad6ebf5cb 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc @@ -304,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel 
numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -313,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -409,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -431,10 +432,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -462,7 +459,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -479,7 +476,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -498,7 +495,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -515,7 +512,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -533,7 +530,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -550,7 +547,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( 
channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -567,7 +564,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -584,7 +581,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -601,7 +598,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -618,7 +615,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -635,7 +632,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -652,7 +649,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1055,9 +1052,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1133,8 +1129,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
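As in the first subprocess, normalise_output now reduces the per-diagram numerator buffers of all good helicities into the "helicity #0" slice before the optional channel-weight multiplication. A host-side sketch of that reduction, assuming the layout ghelNum[( ievt + ighel * nevt ) * ndiag + idiag] used by the patch; all names are illustrative, and the denominator sum over helicities is written out explicitly here so that the final ratio is self-contained:

    #include <cassert>
    #include <vector>

    // Sketch of the cross-helicity reduction in normalise_output: sum the
    // per-diagram numerators of all good helicities into the helicity-#0
    // slice, then optionally reweight each ME by the numerator of the chosen
    // channel over the total denominator.
    void normaliseSketch( std::vector<double>& allMEs,            // allMEs[nevt]
                          std::vector<double>& ghelNum,           // [nGoodHel * nevt * ndiag]
                          std::vector<double>& ghelDen,           // [nGoodHel * nevt]
                          const std::vector<unsigned int>& chans, // channelIds[nevt], 1-based
                          int nGoodHel, int nevt, int ndiag,
                          double globaldenom, bool mulChannelWeight )
    {
      for( int ievt = 0; ievt < nevt; ++ievt )
      {
        allMEs[ievt] /= globaldenom;
        double* totNum = &ghelNum[ievt * ndiag]; // reuse "helicity #0" as the running total
        for( int ighel = 1; ighel < nGoodHel; ++ighel ) // NB: the loop starts at ighel=1
        {
          const double* hNum = &ghelNum[( ievt + ighel * nevt ) * ndiag];
          for( int idiag = 0; idiag < ndiag; ++idiag ) totNum[idiag] += hNum[idiag];
          ghelDen[ievt] += ghelDen[ievt + ighel * nevt]; // explicit in this sketch only
        }
        if( mulChannelWeight )
        {
          assert( chans[ievt] >= 1 ); // channelIds are 1-based, Fortran-style
          allMEs[ievt] *= totNum[chans[ievt] - 1] / ghelDen[ievt];
        }
      }
    }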
#else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1196,25 +1191,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1259,16 +1264,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all 
denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1336,6 +1369,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1345,6 +1379,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1356,8 +1392,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1400,7 +1438,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1419,7 +1457,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1431,6 +1471,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1439,9 +1480,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1452,9 +1494,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one 
event per GPU thread) *** @@ -1492,9 +1540,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1509,7 +1554,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1557,82 +1603,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+ assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+ }
+ // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+ // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+ const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+ if( iconfig <= 0 )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+ assert( iconfig > 0 ); // SANITY CHECK #917
+ }
+ else if( iconfig > (int)mgOnGpu::nconfigSDE )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+ assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+ }
+ fptype targetamp[ncolor] = { 0 };
+ // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+ for( int icolC = 0; icolC < ncolor; icolC++ )
+ {
+ if( icolC == 0 )
+ targetamp[icolC] = 0;
+ else
+ targetamp[icolC] = targetamp[icolC - 1];
+ if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+ }
+ const int ievt = ievt00 + ieppV;
+ //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
 for( int icolC = 0; icolC < ncolor; icolC++ )
 {
- if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+ if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
 {
- allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
- //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+ allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+ //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
 break;
 }
 }
-#endif
 }
 }
 else
@@ -1657,13 +1718,7 @@ namespace mg5amcCpu
 // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
 // [NB 'sum over final spins, average over initial spins', eg see
 // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
- gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
 for( int ipagV = 0; ipagV < npagV; ++ipagV )
 {
 const int ievt0 = ipagV * neppV;
@@ -1671,7 +1726,7 @@
 fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
 MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+ if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
 {
 const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
 fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h
index 7d990b3bbd..2052fbd364 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h
@@ -164,6 +164,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -172,6 +173,8 @@ namespace mg5amcCpu
 fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
 fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
 fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -188,6 +191,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -195,6 +199,8 @@ namespace mg5amcCpu
 int* allselcol, // output: helicity selection[nevt]
 fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
 fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
 const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
index 864a458f2c..c814985da4 100644
--- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
@@ -57,7 +57,7 @@ set zerowidth_tchannel F
 define j = p
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.001878499984741211
+DEBUG: model prefixing takes 0.0018219947814941406
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -168,7 +168,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~
 INFO: Process c~ c > t t~ added to mirror process c c~ > t t~
 INFO: Process d~ d > t t~ added to mirror process d d~ > t t~
 INFO: Process s~ s > t t~ added to mirror process s s~ > t t~
-5 processes with 7 diagrams generated in 0.039 s
+5 processes with 7 diagrams generated in 0.057 s
 Total: 5 processes with 7 diagrams
 add process p p > t t~ j @1
 INFO: Checking for minimal orders which gives processes.
@@ -208,7 +208,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~
 INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g
 INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~
 INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g
-13 processes with 76 diagrams generated in 0.082 s
+13 processes with 76 diagrams generated in 0.136 s
 Total: 18 processes with 83 diagrams
 add process p p > t t~ j j @2
 INFO: Checking for minimal orders which gives processes.
@@ -374,17 +374,17 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~
 INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~
 INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~
 INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams.
-65 processes with 1119 diagrams generated in 0.814 s
+65 processes with 1119 diagrams generated in 0.886 s
 Total: 83 processes with 1202 diagrams
 output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]
 Output will be done with PLUGIN: CUDACPP_OUTPUT
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]
 INFO: initialize a new directory: CODEGEN_mad_pp_tt012j
 INFO: remove old information in CODEGEN_mad_pp_tt012j
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]
 WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j
 INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j
 WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards
@@ -689,22 +689,22 @@ INFO: Finding symmetric diagrams for subprocess group uux_ttx
 DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1564]
 DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1588]
 DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1589]
-Generated helas calls for 18 subprocesses (372 diagrams) in 0.593 s
-Wrote files for 810 helas calls in 25.690 s
+Generated helas calls for 18 subprocesses (372 diagrams) in 0.701 s
+Wrote files for 810 helas calls in 44.199 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in 0.178 s
+ALOHA: aloha creates 5 routines in 0.191 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 10 routines in 0.136 s
+ALOHA: aloha creates 10 routines in 0.178 s
 VVV1
 VVV1
 FFV1
@@ -732,17 +732,17 @@ INFO: Use Fortran compiler gfortran
 INFO: Use c++ compiler g++
 INFO: Generate jpeg diagrams
 INFO: Generate web pages
-DEBUG: result.returncode =  0 [output.py at line 273]
+DEBUG: result.returncode =  0 [output.py at line 275]
 Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done.
 Type "launch" to generate events from this process, or see
 /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README
 Run "open index.html" to see more information about this process.
 quit
-real 0m36.427s
-user 0m5.481s
-sys 0m2.328s
-Code generation completed in 38 seconds
+real 0m57.768s
+user 0m6.485s
+sys 0m2.494s
+Code generation completed in 58 seconds
 /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s'
 function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \
 ************************************************************
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h
index 8a37d1f947..026253f354 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h
@@ -49,6 +49,9 @@
 #define gpuStream_t cudaStream_t
 #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
 #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) )
+#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) )
+#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) )
 #define gpuBlasStatus_t cublasStatus_t
 #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
@@ -113,6 +116,9 @@
 #define gpuStream_t hipStream_t
 #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
 #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) )
+#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) )
+#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) )
 #define gpuBlasStatus_t hipblasStatus_t
 #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
index 5ede45b123..469edd8d9e 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
@@ -162,7 +162,7 @@ namespace mg5amcCpu
 , NumberOfEvents( nevt )
 , m_couplings( nevt )
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- , m_numerators( nevt )
+ , m_numerators( nevt * CPPProcess::ndiagrams )
 , m_denominators( nevt )
 #endif
 {
@@ -220,7 +220,7 @@ namespace mg5amcCpu
 computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
- sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() );
+ sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() );
 #else
 assert( useChannelIds == false );
 sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() );
@@ -356,7 +356,7 @@ namespace mg5amcGpu
 m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering
- m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) );
+ m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) );
 m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) );
 #endif
 // Decide at runtime whether to use BLAS for color sums
@@ -476,7 +476,7 @@ namespace mg5amcGpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated)
 // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering)
- m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) );
+ m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) );
 m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) );
 #endif
 #ifndef MGONGPU_HAS_NO_BLAS
@@ -507,7 +507,7 @@ namespace mg5amcGpu
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
- sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+ sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
 assert( useChannelIds == false );
 sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
index aa1147423f..02f5d7a8eb 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
@@ -303,7 +303,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+ bool storeChannelWeights,
 fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
 fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
 fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -312,7 +312,7 @@ namespace mg5amcCpu
 #else
 cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+ bool storeChannelWeights,
 fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
 fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
 fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -408,7 +408,8 @@ namespace mg5amcCpu
 const fptype* COUPs[nxcoup];
 for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = allNumerators;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+ fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
 fptype* denominators = allDenominators;
 #endif
 #else
@@ -430,10 +431,6 @@ namespace mg5amcCpu
 for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
- // SCALAR channelId for the current event (CUDA)
- unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
 // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
 fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
 fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
@@ -455,7 +452,7 @@ namespace mg5amcCpu
 // Amplitude(s) for diagram number 1
 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId != 0 )
+ if( storeChannelWeights )
 {
 numerators_sv[0] += cxabs2( amp_sv[0] );
 denominators_sv += cxabs2( amp_sv[0] );
@@ -472,7 +469,7 @@ namespace mg5amcCpu
 // Amplitude(s) for diagram number 2
 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId != 0 )
+ if( storeChannelWeights )
 {
 numerators_sv[1] += cxabs2( amp_sv[0] );
 denominators_sv += cxabs2( amp_sv[0] );
@@ -488,7 +485,7 @@ namespace mg5amcCpu
 // Amplitude(s) for diagram number 3
 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId != 0 )
+ if( storeChannelWeights )
 {
 numerators_sv[2] += cxabs2( amp_sv[0] );
 denominators_sv += cxabs2( amp_sv[0] );
@@ -805,9 +802,8 @@ namespace mg5amcCpu
 gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
 // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
- constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
- gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+ constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+ gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
 gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -883,8 +879,7 @@ namespace mg5amcCpu
 cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
- constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
- calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
 calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
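// Note on the two hunks above: the old scalar 'channelId' argument (with 0 as a "disabled"
// sentinel) is replaced throughout by the boolean 'storeChannelWeights'. During helicity
// filtering no single-diagram weights are needed, so the flag is hard-coded to 'false' here;
// in sigmaKin it is instead derived from the inputs, as a sketch of the intended logic:
//   const bool storeChannelWeights = ( allChannelIds != nullptr || allrnddiagram != nullptr );
// Only when the flag is true does calculate_jamps accumulate the per-diagram channel weights,
// one slot per diagram (the buffers are now sized with processConfig::ndiagrams), e.g.:
//   if( storeChannelWeights )
//   {
//     numerators_sv[idiag] += cxabs2( amp_sv[0] ); // one numerator slot per diagram idiag
//     denominators_sv += cxabs2( amp_sv[0] );      // single running denominator per event
//   }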
 #endif /* clang-format on */
@@ -946,25 +941,35 @@ namespace mg5amcCpu
 fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+ bool storeChannelWeights, // if true, compute final multichannel weights
+ bool mulChannelWeight, // if true, multiply matrix element by channel weight
 #endif
- const fptype globaldenom ) /* clang-format on */
+ const fptype globaldenom) /* clang-format on */
 {
 const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
 allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const int nevt = gridDim.x * blockDim.x;
- if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+ if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
 {
 fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities
 fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
 for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
 {
- fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
 fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
- totAllNumerators[ievt] += hAllNumerators[ievt];
 totAllDenominators[ievt] += hAllDenominators[ievt];
+ fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+ fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+ for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+ {
+ firstNumerator[idiag] += hAllNumerators[idiag];
+ }
+ }
+ if( mulChannelWeight )
+ {
+ unsigned int channelId = allChannelIds[ievt];
+ allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
 }
- allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
 }
 #endif
 return;
@@ -1009,16 +1014,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 __global__ void
- select_col( int* allselcol, // output: color selection[nevt]
- const fptype* allrndcol, // input: random numbers[nevt] for color selection
- const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
- const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
- const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ select_col_and_diag( int* allselcol, // output: color selection[nevt]
+ unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+ const fptype* allrndcol, // input: random numbers[nevt] for color selection
+ const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+ const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+ const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+ const fptype* allNumerators, // input: all numerators
+ const fptype* allDenominators, // input: all denominators
+ const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 {
 const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
 // SCALAR channelId for the current event (CUDA)
 unsigned int channelId = gpu_channelId( allChannelIds );
 // Event-by-event random choice of color #402
+
+ // Event-by-event random choice of channel
+ if( allrnddiagram != nullptr )
+ {
+ fptype numerator_sum = 0., normalization = 0.;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+ }
+ channelId = mgOnGpu::nchannels;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+ if( allrnddiagram[ievt] < numerator_sum / normalization )
+ {
+ channelId = ichan + 1;
+ break;
+ }
+ }
+ allDiagramIdsOut[ievt] = channelId;
+ }
+
 if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
 {
 if( channelId > mgOnGpu::nchannels )
@@ -1086,6 +1119,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -1095,6 +1129,8 @@ namespace mg5amcCpu
 fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
 fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
 fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1106,8 +1142,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 int* allselcol, // output: helicity selection[nevt]
- fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+ fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities
+ fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
 const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1150,7 +1188,7 @@ namespace mg5amcCpu
 gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
- gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
 gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
 gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1169,7 +1207,9 @@ namespace mg5amcCpu
 fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
 fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 for( int i = 0; i < processConfig::ndiagrams; ++i )
+ {
 numerators_sv[i] = fptype_sv{ 0 };
+ }
 denominators_sv = fptype_sv{ 0 };
 #endif
 }
@@ -1181,6 +1221,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
 // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+
 // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
 // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
 // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1189,9 +1230,10 @@ namespace mg5amcCpu
 const int ihel = cGoodHel[ighel];
 fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+ fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
 fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
- gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
 gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1202,9 +1244,15 @@ namespace mg5amcCpu
 // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
 // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
 gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Event-by-event random choice of color #402
- gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+ // Event-by-event random choice of color and diagram #402
+ gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+ gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
 // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1242,9 +1290,6 @@ namespace mg5amcCpu
 const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
 const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- unsigned int channelId = getChannelId( allChannelIds, ievt00 );
-#endif
+#endif
 // Running sum of partial amplitudes squared for event by event color selection (#402)
 // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1259,7 +1304,8 @@ namespace mg5amcCpu
 cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
- calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
 calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1307,82 +1353,97 @@ namespace mg5amcCpu
 }
 #endif
 }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
- // Event-by-event random choice of color #402
- if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
- {
- if( channelId > mgOnGpu::nchannels )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
- assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
- }
- // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
- // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
- if( iconfig <= 0 )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
- assert( iconfig > 0 ); // SANITY CHECK #917
- }
- else if( iconfig > (int)mgOnGpu::nconfigSDE )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
- assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
- }
- fptype_sv targetamp[ncolor] = { 0 };
- // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
- for( int icolC = 0; icolC < ncolor; icolC++ )
- {
- if( icolC == 0 )
- targetamp[icolC] = fptype_sv{ 0 };
- else
- targetamp[icolC] = targetamp[icolC - 1];
- if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
- }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype_sv targetamp2[ncolor] = { 0 };
- for( int icolC = 0; icolC < ncolor; icolC++ )
+ const int vecsize = 2 * neppV;
+#else
+ const int vecsize = neppV;
+#endif
+ unsigned int channelIdVec[vecsize];
+ if( allChannelIds != nullptr )
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
 {
- if( icolC == 0 )
- targetamp2[icolC] = fptype_sv{ 0 };
- else
- targetamp2[icolC] = targetamp2[icolC - 1];
- // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
- if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+ const int ievt = ievt00 + ieppV;
+ channelIdVec[ieppV] = allChannelIds[ievt];
 }
-#endif
- for( int ieppV = 0; ieppV < neppV; ++ieppV )
+ }
+
+ // Event-by-event random choice of channel
+ if( allrnddiagram != nullptr )
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
 {
 const int ievt = ievt00 + ieppV;
- //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
- for( int icolC = 0; icolC < ncolor; icolC++ )
+ fptype numerator_sum = 0., normalization = 0.;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
 {
-#if defined MGONGPU_CPPSIMD
- // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
- volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
- const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
- if( okcol )
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+ }
+ channelIdVec[ieppV] = mgOnGpu::nchannels;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+ if( allrnddiagram[ievt] < numerator_sum / normalization )
 {
- allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
- //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+ channelIdVec[ieppV] = ichan + 1;
 break;
 }
 }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- const int ievt2 = ievt00 + ieppV + neppV;
- //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+ allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+ }
+ }
+
+ // Event-by-event random choice of color #402
+ if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+ {
+ unsigned int channelId = channelIdVec[ieppV];
+ if( channelId > mgOnGpu::nchannels )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+ assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+ }
+ // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+ // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+ const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+ if( iconfig <= 0 )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+ assert( iconfig > 0 ); // SANITY CHECK #917
+ }
+ else if( iconfig > (int)mgOnGpu::nconfigSDE )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+ assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+ }
+ fptype targetamp[ncolor] = { 0 };
+ // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+ for( int icolC = 0; icolC < ncolor; icolC++ )
+ {
+ if( icolC == 0 )
+ targetamp[icolC] = 0;
+ else
+ targetamp[icolC] = targetamp[icolC - 1];
+ if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+ }
+ const int ievt = ievt00 + ieppV;
+ //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
 for( int icolC = 0; icolC < ncolor; icolC++ )
 {
- if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+ if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
 {
- allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
- //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+ allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+ //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
 break;
 }
 }
-#endif
 }
 }
 else
@@ -1407,13 +1468,7 @@ namespace mg5amcCpu
 // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
 // [NB 'sum over final spins, average over initial spins', eg see
 // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
- gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
 for( int ipagV = 0; ipagV < npagV; ++ipagV )
 {
 const int ievt0 = ipagV * neppV;
@@ -1421,7 +1476,7 @@
 fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
 MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+ if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
 {
 const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
 fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
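Note on the diagram sampling introduced in select_col_and_diag above: the per-event, per-diagram
numerators act as an unnormalised discrete distribution over channels, and a channel is drawn as
the first one whose cumulative numerator fraction exceeds the random number. A minimal standalone
C++ sketch of that step, assuming a plain per-event array numerators[ndiagrams] and a uniform
random number rnd in [0,1); sampleDiagram is a hypothetical helper, not part of the generated code:

  unsigned int sampleDiagram( const fptype* numerators, double rnd )
  {
    fptype norm = 0;
    for( unsigned int i = 0; i < mgOnGpu::nchannels; i++ )
      if( mgOnGpu::channel2iconfig[i] != -1 ) norm += numerators[i]; // skip channels without an SDE iconfig
    fptype cumsum = 0;
    unsigned int channelId = mgOnGpu::nchannels; // fallback if rnd lands at the upper edge
    for( unsigned int i = 0; i < mgOnGpu::nchannels; i++ )
    {
      if( mgOnGpu::channel2iconfig[i] == -1 ) continue;
      cumsum += numerators[i];
      if( rnd < cumsum / norm ) // first channel whose cumulative fraction exceeds rnd
      {
        channelId = i + 1; // channelId uses Fortran-style indexing [1,nchannels]
        break;
      }
    }
    return channelId;
  }

The same cumulative-sum pattern is then reused for the color choice, where targetamp accumulates
the allowed jamp2 contributions and allrndcol selects the first color whose cumulative fraction
exceeds the random number.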
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h index 3100207a11..04a1595fd1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index ed592e4e1a..791fdf32bd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -306,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* 
allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -315,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -411,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -433,10 +434,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -458,7 +455,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -776,9 +773,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -854,8 +850,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -917,25 +912,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -980,16 +985,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice 
(nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1057,6 +1090,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1066,6 +1100,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1077,8 +1113,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1121,7 +1159,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * 
sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1140,7 +1178,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1152,6 +1192,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1160,9 +1201,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1173,9 +1215,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + 
+  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
   // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1213,9 +1261,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1230,7 +1275,8 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1278,82 +1324,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
         }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
        {
           const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
           {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
            {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
               break;
             }
           }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -1378,13 +1439,7 @@ namespace mg5amcCpu
       // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
       // [NB 'sum over final spins, average over initial spins', eg see
       // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
   for( int ipagV = 0; ipagV < npagV; ++ipagV )
   {
     const int ievt0 = ipagV * neppV;
@@ -1392,7 +1447,7 @@ namespace mg5amcCpu
     fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
     MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
       fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
diff --git
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h
index bb3daa0e4d..36998a8fa1 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h
@@ -166,6 +166,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                  const fptype* allrndcol,           // input: random numbers[nevt] for color selection
                  const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+                 const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
                  fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
                  int* allselhel,                    // output: helicity selection[nevt]
@@ -174,6 +175,8 @@ namespace mg5amcCpu
                  fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
                  fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                  fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+                 bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
                  fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                  fptype* ghelAllJamps,              // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -190,6 +193,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                  const fptype* allrndcol,           // input: random numbers[nevt] for color selection
                  const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+                 const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
                  fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
                  int* allselhel,                    // output: helicity selection[nevt]
@@ -197,6 +201,8 @@ namespace mg5amcCpu
                  int* allselcol,                    // output: color selection[nevt]
                  fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
                  fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+                 unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+                 bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
                  const int nevt );                  // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc
index c2f3ee7141..b8f69df605 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc
@@ -303,7 +303,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
                    fptype* allJamps,                  // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   bool storeChannelWeights,          // input: if true, accumulate the per-diagram multichannel numerators and denominators
                    fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype*
allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -458,7 +455,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -477,7 +474,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -494,7 +491,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -512,7 +509,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -528,7 +525,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -545,7 +542,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) 
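For orientation between these repeated generated per-diagram hunks: every old `if( channelId != 0 )` guard becomes `if( storeChannelWeights )`, and each diagram now accumulates its |amp|^2 into its own slot of the enlarged numerator super-buffer instead of into a single running sum. A minimal sketch of the layout and accumulation this implies (illustrative only: the helper name `accumulateChannelWeights`, the `amp2` array and the plain `double` type are assumptions, not part of the generated code):

// Sketch, not generated code: per-diagram numerator accumulation guarded by storeChannelWeights.
// ghelAllNumerators holds nGoodHel x nevt x ndiagrams slots (one per diagram and event);
// ghelAllDenominators keeps a single running sum per helicity and event.
__device__ void accumulateChannelWeights( double* ghelAllNumerators,
                                          double* ghelAllDenominators,
                                          const double* amp2, // |amp|^2 of each diagram, this event and helicity
                                          bool storeChannelWeights,
                                          int ighel, int ievt, int nevt, int ndiagrams )
{
  if( !storeChannelWeights ) return; // true only if allChannelIds or allrnddiagram was provided
  double* numerators = ghelAllNumerators + ( ighel * nevt + ievt ) * ndiagrams; // per-event slice
  double& denominator = ghelAllDenominators[ighel * nevt + ievt];
  for( int idiag = 0; idiag < ndiagrams; ++idiag )
  {
    numerators[idiag] += amp2[idiag]; // one numerator slot per diagram (previously a single running sum)
    denominator += amp2[idiag];       // the denominator remains a single running sum over diagrams
  }
}

The per-event slice offset `( ighel * nevt + ievt ) * ndiagrams` matches the pointer arithmetic used in this patch (`ghelAllNumerators + ighel * nevt * processConfig::ndiagrams` in sigmaKin, then `&allNumerators[ievt * processConfig::ndiagrams]` inside calculate_jamps).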
{ numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -562,7 +559,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -578,7 +575,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -595,7 +592,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -611,7 +608,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -628,7 +625,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -645,7 +642,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -664,7 +661,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[12] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -680,7 +677,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[13] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -696,7 +693,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[14] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1058,9 +1055,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; 
// no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -1136,8 +1132,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1199,25 +1194,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom ) /* clang-format on */
 {
   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
   allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   const int nevt = gridDim.x * blockDim.x;
-  if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+  if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
   {
     fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
     fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
     for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
     {
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      totAllNumerators[ievt] += hAllNumerators[ievt];
       totAllDenominators[ievt] += hAllDenominators[ievt];
+      fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+      fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+      for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+      {
+        firstNumerator[idiag] += hAllNumerators[idiag];
+      }
+    }
+    if( mulChannelWeight )
+    {
+      unsigned int channelId = allChannelIds[ievt];
+      allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
     }
-    allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
   }
 #endif
   return;
@@ -1262,16 +1267,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr
to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1339,6 +1372,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1348,6 +1382,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1359,8 +1395,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* 
allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1403,7 +1441,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1422,7 +1460,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1434,6 +1474,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1442,9 +1483,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1455,9 +1497,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1495,9 +1543,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1512,7 +1557,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1560,82 +1606,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1660,13 +1721,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1674,7 +1729,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 1b49cac30b..1b956214b7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 60b69e61d0..7c3b3f4b4a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -306,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* 
allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -315,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -411,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -433,10 +434,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -461,7 +458,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -479,7 +476,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -496,7 +493,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -513,7 +510,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -530,7 +527,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -866,9 +863,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -944,8 +940,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1007,25 +1002,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom ) /* clang-format on */
 {
   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
   allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   const int nevt = gridDim.x * blockDim.x;
-  if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+  if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
   {
     fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
     fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
     for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
     {
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      totAllNumerators[ievt] += hAllNumerators[ievt];
       totAllDenominators[ievt] += hAllDenominators[ievt];
+      fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+      fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+      for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+      {
+        firstNumerator[idiag] += hAllNumerators[idiag];
+      }
+    }
+    if( mulChannelWeight )
+    {
+      unsigned int channelId = allChannelIds[ievt];
+      allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] /
totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1070,16 +1075,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1147,6 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1156,6 +1190,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel 
channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1167,8 +1203,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1211,7 +1249,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1230,7 +1268,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1242,6 +1282,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1250,9 +1291,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1263,9 
+1305,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1303,9 +1351,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1320,7 +1365,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1368,82 +1414,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
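These sigmaKin hunks replace the per-SIMD-page scalar channelId (removed here) with an explicit per-event channel choice: a diagram is drawn by inverting the cumulative distribution of the per-diagram numerators, skipping diagrams without an SDE config (channel2iconfig == -1). A minimal standalone sketch of that sampling step, assuming plain `double` scalars in place of the generated fptype/SIMD indexing (the name `sampleChannel` is an assumption, not part of the generated code):

#include <cassert>

// Sketch, not generated code: draw one channelId in [1,nchannels] for one event,
// proportionally to its per-diagram numerators (cf. select_col_and_diag on the GPU
// side and the rewritten C++ block below).
unsigned int sampleChannel( const double* numerators,   // per-diagram weights for one event
                            const int* channel2iconfig, // -1 marks diagrams without an SDE config
                            unsigned int nchannels,
                            double rnd )                // uniform random number in [0,1)
{
  double normalization = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  assert( normalization > 0. ); // at least one diagram must carry weight
  double numerator_sum = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue;
    numerator_sum += numerators[ichan];
    if( rnd < numerator_sum / normalization ) return ichan + 1; // Fortran-style [1,nchannels]
  }
  return nchannels; // numerical edge case: fall back to the last channel, as the generated code does
}

The sampled channelId then feeds the existing color-selection logic, which is why the color choice below is now guarded by `allChannelIds != nullptr || allrnddiagram != nullptr` rather than by a single scalar channelId.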
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( 
"sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1468,13 +1529,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1482,7 +1537,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= 
helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index bd42537623..0bf2e4625f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index d1a34b8ade..36ef0f1276 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -306,7 +306,7 @@ namespace mg5amcCpu 
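Note: the header hunks above extend both sigmaKin entry points with three multichannel arguments (allrnddiagram, allDiagramIdsOut, mulChannelWeight) and change the numerator buffer to one slot per (event, diagram). The buffer sizes a host driver now has to provide, in a minimal hedged sketch (buffer names and std::vector usage are illustrative, not part of this patch):

    // Sizes implied by the new per-diagram layout (C++ build, assuming <vector> is included):
    std::vector<fptype> allNumerators( nevt * processConfig::ndiagrams, 0. ); // one slot per (event, diagram)
    std::vector<fptype> allDenominators( nevt, 0. );                          // still one slot per event
    std::vector<unsigned int> allDiagramIdsOut( nevt, 0 );                    // sampled channel id per event (1 to ndiagrams)
    const fptype* allrnddiagram = nullptr; // nullptr keeps the previous behaviour (no channel sampling)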
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc
index d1a34b8ade..36ef0f1276 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc
@@ -306,7 +306,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
                     fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                    bool storeChannelWeights, // input: if true, accumulate multichannel numerators and denominators
                     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
                     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
                     fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -315,7 +315,7 @@ namespace mg5amcCpu
 #else
                     cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                    const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+                    bool storeChannelWeights, // input: if true, accumulate multichannel numerators and denominators
                     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
                     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
                     fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -411,7 +411,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -433,10 +434,6 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
     fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
@@ -461,7 +458,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 1
     FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[0] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -479,7 +476,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 2
     FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[1] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -496,7 +493,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 3
     FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[2] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -513,7 +510,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 4
     FFV1_0( w_fp[4], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[3] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -530,7 +527,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 5
     VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[4] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -866,9 +863,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -944,8 +940,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1007,25 +1002,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights, // input: if true, compute final multichannel weights
+                    bool mulChannelWeight, // input: if true, multiply the matrix element by the channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
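Note: the reduction above relies on the ghelAllNumerators super-buffer being laid out as [ighel][ievt][idiag], matching the ( ievt + ighel * nevt ) * processConfig::ndiagrams offset used in calculate_jamps, with the per-helicity totals folded into the ighel=0 block (denominators remain one scalar per event). A small helper expressing that layout, as an illustrative sketch only (not added by this patch):

    // Flat index of diagram idiag, event ievt, good helicity ighel in ghelAllNumerators
    // (total buffer size: nGoodHel * nevt * ndiagrams, as zeroed by the gpuMemset in sigmaKin)
    __host__ __device__ inline size_t
    ghelNumIndex( int ighel, int ievt, int idiag, int nevt )
    {
      return ( (size_t)ighel * nevt + ievt ) * processConfig::ndiagrams + idiag;
    }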
@@ -1070,16 +1075,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol, // output: color selection[nevt]
-              const fptype* allrndcol, // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol, // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+                       const fptype* allrndcol, // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators, // input: all numerators
+                       const fptype* allDenominators, // input: all denominators
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1147,6 +1180,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                         const fptype* allrndcol, // input: random numbers[nevt] for color selection
                         const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+                        const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
                         fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
                         int* allselhel, // output: helicity selection[nevt]
@@ -1156,6 +1190,8 @@ namespace mg5amcCpu
                         fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
                         fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                         fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                        unsigned int* allDiagramIdsOut, // output: multichannel channel ids[nevt] sampled per event (1 to #diagrams)
+                        bool mulChannelWeight, // input: if true, multiply the channel weight into the ME output
 #endif
                         fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                         fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1167,8 +1203,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                         int* allselcol, // output: helicity selection[nevt]
-                        fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-                        fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+                        fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities
+                        fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+                        unsigned int* allDiagramIdsOut, // output: multichannel channel ids[nevt] sampled per event (1 to #diagrams)
+                        bool mulChannelWeight, // input: if true, multiply the channel weight into the ME output
 #endif
                         const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1211,7 +1249,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1230,7 +1268,9 @@ namespace mg5amcCpu
       fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
       for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1242,6 +1282,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
   // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+
   // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
   // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
   // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1250,9 +1291,10 @@ namespace mg5amcCpu
     const int ihel = cGoodHel[ighel];
     fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
     fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
     gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1263,9 +1305,15 @@ namespace mg5amcCpu
   // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
   // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
   gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-  // Event-by-event random choice of color #402
-  gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+  bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+  // Event-by-event random choice of color and diagram #402
+  gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
   // *** END OF PART 1a - CUDA (one event per GPU thread) ***
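Note: the kernel ordering in PART 1a matters here: normalise_output first folds the per-helicity numerator blocks into the ighel=0 block of ghelAllNumerators, and select_col_and_diag then samples a channel from those helicity-summed totals. The sampling rule is a plain CDF inversion over the per-diagram numerators; schematically, for one event (a sketch assuming num[i] holds the helicity-summed numerator of diagram i and rnd is allrnddiagram[ievt]):

    fptype norm = 0;
    for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
      if( mgOnGpu::channel2iconfig[ichan] != -1 ) norm += num[ichan]; // skip channels without an SDE config
    unsigned int chosen = mgOnGpu::nchannels; // fallback if rounding exhausts the loop without a hit
    fptype cdf = 0;
    for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
    {
      if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
      cdf += num[ichan];
      if( rnd < cdf / norm ) { chosen = ichan + 1; break; } // channel ids are 1-based
    }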
@@ -1303,9 +1351,6 @@ namespace mg5amcCpu
      const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
      const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1320,7 +1365,8 @@ namespace mg5amcCpu
      cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1368,82 +1414,97 @@ namespace mg5amcCpu
        }
 #endif
      }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
        {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
        }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
        {
          const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
          {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
            {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
              break;
            }
          }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
          for( int icolC = 0; icolC < ncolor; icolC++ )
          {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
            {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
              break;
            }
          }
-#endif
        }
      }
      else
@@ -1468,13 +1529,7 @@ namespace mg5amcCpu
      // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
      // [NB 'sum over final spins, average over initial spins', eg see
      // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
    for( int ipagV = 0; ipagV < npagV; ++ipagV )
    {
      const int ievt0 = ipagV * neppV;
@@ -1482,7 +1537,7 @@ namespace mg5amcCpu
      fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
      MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
      {
        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
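Note: in the C++ (SIMD) path the numerator buffer keeps the event-page structure: diagrams are blocked per neppV-event page, so the flat index used in the channel-sampling loops above decomposes into page, diagram and lane. An equivalent helper, as an illustrative sketch (assuming neppV and processConfig::ndiagrams are visible as in CPPProcess.cc):

    // allNumerators layout on CPU: [ipagV][idiag][ieppV], with ipagV = ievt / neppV and ieppV = ievt % neppV;
    // this reproduces ievt / neppV * neppV * processConfig::ndiagrams + idiag * neppV + ievt % neppV
    inline size_t cpuNumIndex( int ievt, int idiag )
    {
      const int ipagV = ievt / neppV;
      const int ieppV = ievt % neppV;
      return ( (size_t)ipagV * processConfig::ndiagrams + idiag ) * neppV + ieppV;
    }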
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
index dd4aae8a06..9191598e88 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
@@ -166,6 +166,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                         const fptype* allrndcol, // input: random numbers[nevt] for color selection
                         const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+                        const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
                         fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
                         int* allselhel, // output: helicity selection[nevt]
@@ -174,6 +175,8 @@ namespace mg5amcCpu
                         fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
                         fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                         fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                        unsigned int* allDiagramIdsOut, // output: multichannel channel ids[nevt] sampled per event (1 to #diagrams)
+                        bool mulChannelWeight, // input: if true, multiply the channel weight into the ME output
 #endif
                         fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                         fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -190,6 +193,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                         const fptype* allrndcol, // input: random numbers[nevt] for color selection
                         const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+                        const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
                         fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
                         int* allselhel, // output: helicity selection[nevt]
@@ -197,6 +201,8 @@ namespace mg5amcCpu
                         int* allselcol, // output: helicity selection[nevt]
                         fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
                         fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+                        unsigned int* allDiagramIdsOut, // output: multichannel channel ids[nevt] sampled per event (1 to #diagrams)
+                        bool mulChannelWeight, // input: if true, multiply the channel weight into the ME output
 #endif
                         const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc
index ec76c63604..ab7500dca5 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc
@@ -306,7 +306,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
                     fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                    bool storeChannelWeights, // input: if true, accumulate multichannel numerators and denominators
                     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
                     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
                     fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -315,7 +315,7 @@ namespace mg5amcCpu
 #else
                     cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                    const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+                    bool storeChannelWeights, // input: if true, accumulate multichannel numerators and denominators
                     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
                     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
                     fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -411,7 +411,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -433,10 +434,6 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
     fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
@@ -461,7 +458,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 1
     FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[0] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -479,7 +476,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 2
     FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[1] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -496,7 +493,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 3
     FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[2] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -513,7 +510,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 4
     FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[3] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -530,7 +527,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 5
     VVV1_0( w_fp[4], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[4] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
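Note: as in the other subprocesses, each diagram block now gates the weight accumulation on storeChannelWeights instead of a per-event channelId: the per-diagram |amp|^2 sums are needed whenever either channel reweighting or channel sampling is requested, not only when a fixed channel is supplied. Schematically, per diagram d (a sketch, not patch code):

    // after computing amp_sv[0] for diagram d
    if( storeChannelWeights )
    {
      numerators_sv[d] += cxabs2( amp_sv[0] ); // per-diagram |A_d|^2, summed over helicities
      denominators_sv += cxabs2( amp_sv[0] );  // shared normalisation: sum over all diagrams
    }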
@@ -866,9 +863,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -944,8 +940,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1007,25 +1002,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights, // input: if true, compute final multichannel weights
+                    bool mulChannelWeight, // input: if true, multiply the matrix element by the channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
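Note: with the per-diagram totals in place, the single-diagram enhancement applied under mulChannelWeight reduces to the usual SDE weight for the event's channel c,

    |M|^2  ->  |M|^2 * N_c / D,   with  N_c = sum_h |A_c(h)|^2  and  D = sum_h sum_d |A_d(h)|^2,

which is exactly the totAllNumerators[channelId - 1 + ievt * ndiagrams] / totAllDenominators[ievt] factor computed above.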
@@ -1070,16 +1075,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol, // output: color selection[nevt]
-              const fptype* allrndcol, // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol, // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+                       const fptype* allrndcol, // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators, // input: all numerators
+                       const fptype* allDenominators, // input: all denominators
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1147,6 +1180,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                         const fptype* allrndcol, // input: random numbers[nevt] for color selection
                         const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+                        const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
                         fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
                         int* allselhel, // output: helicity selection[nevt]
@@ -1156,6 +1190,8 @@ namespace mg5amcCpu
                         fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
                         fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                         fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                        unsigned int* allDiagramIdsOut, // output: multichannel channel ids[nevt] sampled per event (1 to #diagrams)
+                        bool mulChannelWeight, // input: if true, multiply the channel weight into the ME output
 #endif
                         fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                         fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1167,8 +1203,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                         int* allselcol, // output: helicity selection[nevt]
-                        fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-                        fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+                        fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities
+                        fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+                        unsigned int* allDiagramIdsOut, // output: multichannel channel ids[nevt] sampled per event (1 to #diagrams)
+                        bool mulChannelWeight, // input: if true, multiply the channel weight into the ME output
 #endif
                         const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1211,7 +1249,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1230,7 +1268,9 @@ namespace mg5amcCpu
       fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
       for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1242,6 +1282,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
   // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+
   // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
   // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
   // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1250,9 +1291,10 @@ namespace mg5amcCpu
     const int ihel = cGoodHel[ighel];
     fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
     fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
     gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1263,9 +1305,15 @@ namespace mg5amcCpu
   // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
   // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
   gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-  // Event-by-event random choice of color #402
-  gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+  bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+  // Event-by-event random choice of color and diagram #402
+  gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
   // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1303,9 +1351,6 @@ namespace mg5amcCpu
      const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
      const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1320,7 +1365,8 @@ namespace mg5amcCpu
      cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1368,82 +1414,97 @@ namespace mg5amcCpu
        }
 #endif
      }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
        {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
        }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
        {
          const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
          {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
            {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
              break;
            }
          }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
          for( int icolC = 0; icolC < ncolor; icolC++ )
          {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
            {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
              break;
            }
          }
-#endif
        }
      }
      else
@@ -1468,13 +1529,7 @@ namespace mg5amcCpu
      // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
      // [NB 'sum over final spins, average over initial spins', eg see
      // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
    for( int ipagV = 0; ipagV < npagV; ++ipagV )
    {
      const int ievt0 = ipagV * neppV;
@@ -1482,7 +1537,7 @@ namespace mg5amcCpu
      fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
      MEs_sv /=
helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index 46d25105cc..514325b407 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index c529cd5dd7..c5593feede 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu 
#ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -494,7 +491,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -517,7 +514,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -540,7 +537,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -564,7 +561,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -581,7 +578,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( 
w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -600,7 +597,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -617,7 +614,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -634,7 +631,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -653,7 +650,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -670,7 +667,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -687,7 +684,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -706,7 +703,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[12] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -723,7 +720,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[13] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -742,7 +739,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[14] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -761,7 +758,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[15] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -782,7 +779,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( 
storeChannelWeights ) { numerators_sv[16] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -798,7 +795,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[17] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -814,7 +811,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[18] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -832,7 +829,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[19] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -851,7 +848,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[20] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -868,7 +865,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[21] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -885,7 +882,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[22] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -904,7 +901,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[23] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -921,7 +918,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[24] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -938,7 +935,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[25] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -954,7 +951,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[26] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -970,7 +967,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[27] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -986,7 
+983,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[28] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1002,7 +999,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[29] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1019,7 +1016,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[30] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1064,7 +1061,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[32] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1080,7 +1077,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 34 FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[33] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1096,7 +1093,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[34] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1113,7 +1110,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[35] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1132,7 +1129,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 37 FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[36] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1149,7 +1146,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 38 FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[37] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1166,7 +1163,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 39 VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[38] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1185,7 +1182,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 40 FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[39] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1202,7 +1199,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 41 FFV1_0( w_fp[12], w_fp[11], 
w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[40] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1219,7 +1216,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 42 FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[41] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1235,7 +1232,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 43 FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[42] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1251,7 +1248,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 44 FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[43] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1267,7 +1264,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 45 FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[44] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1283,7 +1280,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 46 FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[45] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1300,7 +1297,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 47 VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[46] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1342,7 +1339,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 49 FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[48] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1359,7 +1356,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 50 FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[49] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1378,7 +1375,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 51 FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[50] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1395,7 +1392,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 52 FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[51] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1412,7 +1409,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 53 FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 
0 ) + if( storeChannelWeights ) { numerators_sv[52] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1431,7 +1428,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 54 FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[53] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1448,7 +1445,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 55 FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[54] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1467,7 +1464,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 56 FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[55] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1486,7 +1483,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 57 VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[56] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1543,7 +1540,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 59 VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[58] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1566,7 +1563,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 60 VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[59] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1589,7 +1586,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 61 FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[60] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1608,7 +1605,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 62 FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[61] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1625,7 +1622,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 63 FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[62] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1644,7 +1641,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 64 FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[63] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1662,7 +1659,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 65 FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[64] += cxabs2( amp_sv[0] ); denominators_sv 
+= cxabs2( amp_sv[0] ); @@ -1679,7 +1676,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 66 FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[65] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1698,7 +1695,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 67 FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[66] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1715,7 +1712,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 68 FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[67] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1732,7 +1729,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 69 FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[68] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1751,7 +1748,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 70 FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[69] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1768,7 +1765,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 71 FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[70] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1787,7 +1784,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 72 FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[71] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1806,7 +1803,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 73 VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[72] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1863,7 +1860,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 75 VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[74] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1886,7 +1883,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 76 VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[75] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1909,7 +1906,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 77 FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[76] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1928,7 +1925,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram 
number 78 FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[77] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1945,7 +1942,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 79 FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[78] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1964,7 +1961,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 80 FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[79] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1981,7 +1978,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 81 FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[80] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1997,7 +1994,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 82 FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[81] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2013,7 +2010,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 83 FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[82] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2029,7 +2026,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 84 FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[83] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2045,7 +2042,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 85 FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[84] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2062,7 +2059,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 86 FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[85] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2081,7 +2078,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 87 FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[86] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2097,7 +2094,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 88 FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[87] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2113,7 +2110,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 89 FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[88] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2129,7 +2126,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 90 FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[89] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2145,7 +2142,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 91 FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[90] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2162,7 +2159,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 92 FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[91] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2215,7 +2212,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 94 VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[93] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2238,7 +2235,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 95 VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[94] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2261,7 +2258,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 96 FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[95] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2280,7 +2277,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 97 FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[96] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2297,7 +2294,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 98 FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[97] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2316,7 +2313,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 99 FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[98] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2367,7 +2364,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 101 VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[100] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2390,7 +2387,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 102 VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { 
numerators_sv[101] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2413,7 +2410,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 103 FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[102] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2432,7 +2429,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 104 FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[103] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2449,7 +2446,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 105 FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[104] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2468,7 +2465,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 106 FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[105] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2519,7 +2516,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 108 VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[107] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2542,7 +2539,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 109 VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[108] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2565,7 +2562,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 110 FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[109] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2581,7 +2578,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 111 FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[110] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2597,7 +2594,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 112 FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[111] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -2613,7 +2610,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 113 FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[112] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -3256,9 +3253,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for 
color selection during helicity filtering
- constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
- gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+ constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+ gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
#else
gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
#endif
@@ -3334,8 +3330,7 @@ namespace mg5amcCpu
cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
- constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
- calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
#else
calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
#endif /* clang-format on */
@@ -3397,25 +3392,35 @@ namespace mg5amcCpu
fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+ bool storeChannelWeights, // if true, compute final multichannel weights
+ bool mulChannelWeight, // if true, multiply matrix element by channel weight
#endif
- const fptype globaldenom ) /* clang-format on */
+ const fptype globaldenom) /* clang-format on */
{
const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
allMEs[ievt] /= globaldenom;
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
const int nevt = gridDim.x * blockDim.x;
- if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+ if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
{
fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities
fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
{
- fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
- totAllNumerators[ievt] += hAllNumerators[ievt];
totAllDenominators[ievt] += hAllDenominators[ievt];
+ fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+ fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+ for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+ {
+ firstNumerator[idiag] += hAllNumerators[idiag];
+ }
+ }
+ if( mulChannelWeight )
+ {
+ unsigned int channelId = allChannelIds[ievt];
+ allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
}
- allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
}
#endif
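// NB (illustrative layout note, not part of the patch, inferred from the indexing above):
// the per-diagram numerator super-buffer is effectively addressed as
//   ghelAllNumerators[ ( ighel * nevt + ievt ) * processConfig::ndiagrams + idiag ]
// i.e. helicity-major, then event, then diagram; each helicity slice strides by
// nevt * ndiagrams, the loop above reduces all helicities into the ighel=0 slice,
// and the channel weight for one event is then read back at offset channelId - 1
// within that event's ndiagrams-sized block.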
return;
@@ -3460,16 +3465,44 @@ namespace mg5amcCpu
#ifdef MGONGPUCPP_GPUIMPL
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
__global__ void
- select_col( int* allselcol, // output: color selection[nevt]
- const fptype* allrndcol, // input: random numbers[nevt] for color selection
- const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
- const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
- const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ select_col_and_diag( int* allselcol, // output: color selection[nevt]
+ unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+ const fptype* allrndcol, // input: random numbers[nevt] for color selection
+ const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+ const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+ const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+ const fptype* allNumerators, // input: all numerators
+ const fptype* allDenominators, // input: all denominators
+ const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
{
const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
// SCALAR channelId for the current event (CUDA)
unsigned int channelId = gpu_channelId( allChannelIds );
// Event-by-event random choice of color #402
+
+ // Event-by-event random choice of channel
+ if( allrnddiagram != nullptr )
+ {
+ fptype numerator_sum = 0., normalization = 0.;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+ }
+ channelId = mgOnGpu::nchannels;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+ if( allrnddiagram[ievt] < numerator_sum / normalization )
+ {
+ channelId = ichan + 1;
+ break;
+ }
+ }
+ allDiagramIdsOut[ievt] = channelId;
+ }
+
if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
{
if( channelId > mgOnGpu::nchannels )
@@ -3537,6 +3570,7 @@ namespace mg5amcCpu
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
const fptype* allrndcol, // input: random numbers[nevt] for color selection
const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
#endif
fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
int* allselhel, // output: helicity selection[nevt]
@@ -3546,6 +3580,8 @@ namespace mg5amcCpu
fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply channel weight to ME output
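// (illustrative note on the two new arguments, not generated code: when
// allrnddiagram != nullptr, allDiagramIdsOut[ievt] is filled by an inverse-CDF
// draw over the per-diagram numerators, i.e. with
//   norm = sum of numerators[ichan] over channels with channel2iconfig[ichan] != -1
// the first ichan whose running sum satisfies allrnddiagram[ievt] < cumsum / norm
// is selected as channelId = ichan + 1, defaulting to nchannels for a random
// number close to 1; when mulChannelWeight is true, normalise_output then rescales
// |M|^2 by numerators[channelId - 1] / denominator)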
#endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -3557,8 +3593,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -3601,7 +3639,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -3620,7 +3658,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -3632,6 +3672,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -3640,9 +3681,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -3653,9 +3695,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the 
ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3693,9 +3741,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -3710,7 +3755,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -3758,82 +3804,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
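// (illustrative example with hypothetical values: with nchannels=3 and
// channel2iconfig = { 1, -1, 2 }, channelId=1 maps to iconfig=1, channelId=2
// has no associated single-diagram config and must be skipped, and channelId=3
// maps to iconfig=2; hence the C-indexed lookup channel2iconfig[channelId - 1]
// below yields an F-indexed iconfig in [1, nconfigSDE])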
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( 
"sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -3858,13 +3919,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -3872,7 +3927,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= 
helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index 0c3370cd1c..c0d59a27ea 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index b5ba7190b8..d133fb651d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -306,7 +306,7 @@ namespace mg5amcCpu 
#ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, // input: if true, compute multichannel numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -315,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, // input: if true, compute multichannel numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -411,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -433,10 +434,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -464,7 +461,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -483,7 +480,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -502,7 +499,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -521,7 +518,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -540,7 +537,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], 
w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -560,7 +557,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -578,7 +575,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -595,7 +592,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -612,7 +609,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -629,7 +626,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 VVV1_0( w_fp[1], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -647,7 +644,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -664,7 +661,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -681,7 +678,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[12], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[12] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -698,7 +695,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[13] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -715,7 +712,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[14] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -733,7 +730,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights 
) { numerators_sv[15] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -750,7 +747,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[16] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -767,7 +764,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[17] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -784,7 +781,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[18] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -801,7 +798,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[19] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -819,7 +816,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[20] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -836,7 +833,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[21] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -853,7 +850,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[22] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -870,7 +867,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[23] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -887,7 +884,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[24] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -904,7 +901,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[25] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -921,7 +918,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[26] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -938,7 +935,7 @@ namespace 
mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[27] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -955,7 +952,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[28] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -972,7 +969,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[29] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -989,7 +986,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[30] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1006,7 +1003,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[31] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1023,7 +1020,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[12], w_fp[4], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[32] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1062,7 +1059,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 VVV1_0( w_fp[1], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[34] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1081,7 +1078,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[35] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1453,9 +1450,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1531,8 +1527,7 @@ namespace 
mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1594,25 +1589,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight && allChannelIds != nullptr ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1657,16 +1662,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: multichannel numerators[nevt * ndiagrams], summed over helicities + const fptype* allDenominators, // input: multichannel denominators[nevt], summed over helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1734,6 +1767,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1743,6 +1777,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // input: if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1754,8 +1790,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // input: if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1798,7 +1836,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1817,7 +1855,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1829,6 +1869,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1837,9 +1878,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1850,9 +1892,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, 
storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1890,9 +1938,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1907,7 +1952,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1955,82 +2001,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -2055,13 +2116,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -2069,7 +2124,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index 471c526c49..abcc2d6233 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // input: if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // input: if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index 4b9f028e6d..e2ac5942a4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -306,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, // input: if true, compute multichannel numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel 
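// [Editorial sketch, not generated code] With the per-diagram numerators introduced by this patch, each GPU thread owns a contiguous slice of processConfig::ndiagrams entries of allNumerators, mirroring the '+ fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];' line further down; an illustrative accessor (the helper name is an assumption) would read:
__device__ inline fptype* exampleEventNumerators( fptype* allNumerators, int ievt )
{
  return &allNumerators[ievt * processConfig::ndiagrams]; // entry idiag of event ievt sits at [ievt * ndiagrams + idiag]
}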
fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -315,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, // input: if true, compute multichannel numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -411,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -433,10 +434,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -464,7 +461,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[1], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -483,7 +480,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -502,7 +499,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -521,7 +518,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -540,7 +537,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -560,7 +557,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( 
storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -578,7 +575,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[1], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -595,7 +592,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[5], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -612,7 +609,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -629,7 +626,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -647,7 +644,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -664,7 +661,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[1], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -681,7 +678,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[12], w_fp[5], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[12] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -698,7 +695,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[13] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -715,7 +712,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[14] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -733,7 +730,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[15] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -750,7 +747,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[16] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -767,7 +764,7 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[17] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -784,7 +781,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[1], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[18] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -801,7 +798,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[19] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -819,7 +816,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[20] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -836,7 +833,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[21] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -853,7 +850,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[22] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -870,7 +867,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[9], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[23] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -887,7 +884,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[24] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -904,7 +901,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[25] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -921,7 +918,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[26] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -938,7 +935,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[27] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -955,7 +952,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); 
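// [Editorial sketch, not generated code] Every diagram block in these generated files repeats the same guarded accumulation whose condition the surrounding '+' lines switch from 'channelId != 0' to 'storeChannelWeights'; factored into a hypothetical helper, the pattern is simply:
inline void exampleAccumulateChannelWeight( fptype_sv* numerators_sv, fptype_sv& denominators_sv, const fptype_sv& amp2, int idiag )
{
  numerators_sv[idiag] += amp2; // per-diagram numerator (used for channel weights and diagram sampling); here amp2 = cxabs2( amp_sv[0] )
  denominators_sv += amp2;      // common denominator: running sum of |amp|^2 over all diagrams
}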
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[28] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -972,7 +969,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[1], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[29] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -989,7 +986,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[1], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[30] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1006,7 +1003,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[31] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1023,7 +1020,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[12], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[32] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1062,7 +1059,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[34] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1081,7 +1078,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[35] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1453,9 +1450,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1531,8 +1527,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
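// NOTE: throughout this patch the boolean 'storeChannelWeights' replaces the old
// 'allChannelIds != nullptr' / 'channelId != 0' guards around the per-diagram
// numerator/denominator accumulation. A minimal sketch of the predicate that the
// callers below use to derive it (hypothetical helper, not part of the generated code):
//
//   inline bool needChannelWeights( const unsigned int* allChannelIds, const fptype* allrnddiagram )
//   {
//     // weights are needed both for Fortran-driven SDE (allChannelIds) and for
//     // on-the-fly diagram sampling (allrnddiagram)
//     return allChannelIds != nullptr || allrnddiagram != nullptr;
//   }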
@@ -1531,8 +1527,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1594,25 +1589,35 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights,          // input: if true, compute the final multichannel weights
+    bool mulChannelWeight,             // input: if true, multiply the matrix element by the channel weight
 #endif
-    const fptype globaldenom ) /* clang-format on */
+    const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        const unsigned int channelId = allChannelIds[ievt]; // NB: mulChannelWeight assumes allChannelIds != nullptr
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -1657,16 +1662,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                      // output: color selection[nevt]
-              const fptype* allrndcol,             // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds,   // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,          // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                     // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids[nevt]
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: per-diagram numerators, summed over helicities
+                       const fptype* allDenominators,     // input: denominators, summed over helicities
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels; // fallback: last channel (guards against rounding when allrnddiagram[ievt] ~ 1)
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
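// The diagram sampling above is a standard inverse-CDF draw over the per-diagram
// weights. A self-contained sketch of the same logic for one event, with
// hypothetical scalar inputs (double weights, one random number in [0,1)):
//
//   unsigned int sampleChannel( const double* numerators, unsigned int nchannels, double rnd )
//   {
//     double normalization = 0.;
//     for( unsigned int ichan = 0; ichan < nchannels; ichan++ ) normalization += numerators[ichan];
//     double cumsum = 0.;
//     for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
//     {
//       cumsum += numerators[ichan];
//       if( rnd < cumsum / normalization ) return ichan + 1; // 1-based channelId
//     }
//     return nchannels; // fallback for rnd ~ 1 under rounding
//   }
//
// (the kernel additionally skips channels with channel2iconfig[ichan] == -1,
// i.e. diagrams with no associated single-diagram-enhancement config)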
@@ -1734,6 +1767,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -1743,6 +1777,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // input: if true, multiply the channel weight into the ME output
 #endif
     fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1754,8 +1790,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol,          // output: helicity selection[nevt]
-    fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // input: if true, multiply the channel weight into the ME output
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1798,7 +1836,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1817,7 +1855,9 @@ namespace mg5amcCpu
     fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
     for( int i = 0; i < processConfig::ndiagrams; ++i )
+    {
       numerators_sv[i] = fptype_sv{ 0 };
+    }
     denominators_sv = fptype_sv{ 0 };
 #endif
   }
@@ -1829,6 +1869,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
   // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+
   // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
   // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
   // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1837,9 +1878,10 @@ namespace mg5amcCpu
     const int ihel = cGoodHel[ighel];
     fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
     fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
     gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1850,9 +1892,15 @@ namespace mg5amcCpu
   // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
   // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
   gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-  // Event-by-event random choice of color #402
-  gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+  bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+  // Event-by-event random choice of color and diagram #402
+  gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
   // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1890,9 +1938,6 @@ namespace mg5amcCpu
     const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
     const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
     // Running sum of partial amplitudes squared for event by event color selection (#402)
     // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1907,7 +1952,8 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
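// The C++ code below indexes the per-diagram numerators with
//   allNumerators[ ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV ]
// i.e. it assumes an AOSOA-style layout: for each SIMD page of neppV events,
// ndiagrams contiguous blocks of neppV values. A sketch of the equivalent index
// function (hypothetical helper, for illustration only):
//
//   inline int numeratorIndex( int ievt, int idiag, int neppV, int ndiagrams )
//   {
//     const int ipagV = ievt / neppV; // SIMD page
//     const int ieppV = ievt % neppV; // event within the page
//     return ( ipagV * ndiagrams + idiag ) * neppV + ieppV;
//   }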
@@ -1955,82 +2001,97 @@ namespace mg5amcCpu
       }
 #endif
     }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-    // Event-by-event random choice of color #402
-    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-    {
-      if( channelId > mgOnGpu::nchannels )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-        assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-      }
-      // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-      // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-      const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-      if( iconfig <= 0 )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-        assert( iconfig > 0 ); // SANITY CHECK #917
-      }
-      else if( iconfig > (int)mgOnGpu::nconfigSDE )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-        assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-      }
-      fptype_sv targetamp[ncolor] = { 0 };
-      // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-      for( int icolC = 0; icolC < ncolor; icolC++ )
-      {
-        if( icolC == 0 )
-          targetamp[icolC] = fptype_sv{ 0 };
-        else
-          targetamp[icolC] = targetamp[icolC - 1];
-        if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-      }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype_sv targetamp2[ncolor] = { 0 };
-      for( int icolC = 0; icolC < ncolor; icolC++ )
+    const int vecsize = 2 * neppV;
+#else
+    const int vecsize = neppV;
+#endif
+    unsigned int channelIdVec[vecsize];
+    if( allChannelIds != nullptr )
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
       {
-        if( icolC == 0 )
-          targetamp2[icolC] = fptype_sv{ 0 };
-        else
-          targetamp2[icolC] = targetamp2[icolC - 1];
-        // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-        if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+        const int ievt = ievt00 + ieppV;
+        channelIdVec[ieppV] = allChannelIds[ievt];
       }
-#endif
-      for( int ieppV = 0; ieppV < neppV; ++ieppV )
+    }
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
       {
         const int ievt = ievt00 + ieppV;
-        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+        fptype numerator_sum = 0., normalization = 0.;
+        for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
         {
-#if defined MGONGPU_CPPSIMD
-          // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-          volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-          const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-          if( okcol )
+          if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+          normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+        }
+        channelIdVec[ieppV] = mgOnGpu::nchannels;
+        for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+        {
+          if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+          numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+          if( allrnddiagram[ievt] < numerator_sum / normalization )
           {
-            allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-            //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+            channelIdVec[ieppV] = ichan + 1;
             break;
           }
         }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        const int ievt2 = ievt00 + ieppV + neppV;
-        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+        allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+      }
+    }
+
+    // Event-by-event random choice of color #402
+    if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+      {
+        unsigned int channelId = channelIdVec[ieppV];
+        if( channelId > mgOnGpu::nchannels )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+        }
+        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+        if( iconfig <= 0 )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+          assert( iconfig > 0 ); // SANITY CHECK #917
+        }
+        else if( iconfig > (int)mgOnGpu::nconfigSDE )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+        }
+        fptype targetamp[ncolor] = { 0 };
+        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+        for( int icolC = 0; icolC < ncolor; icolC++ )
+        {
+          if( icolC == 0 )
+            targetamp[icolC] = 0;
+          else
+            targetamp[icolC] = targetamp[icolC - 1];
+          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+        }
+        const int ievt = ievt00 + ieppV;
+        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
         for( int icolC = 0; icolC < ncolor; icolC++ )
         {
-          if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+          if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
          {
-            allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-            //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+            allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+            //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
             break;
           }
         }
-#endif
       }
     }
     else
@@ -2055,13 +2116,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
      const int ievt0 = ipagV * neppV;
@@ -2069,7 +2124,7 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
         const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
         fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
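The GPU-side numerator super-buffer grows from one value per event and helicity to one value per diagram, event and helicity. A minimal sketch of the layout assumed by the offsets in the hunks above (illustrative only; ighel, ievt, idiag as in the generated code):

    // ghelAllNumerators holds cNGoodHel slabs of nevt * ndiagrams values:
    //   value( ighel, ievt, idiag ) = ghelAllNumerators[ ( ighel * nevt + ievt ) * ndiagrams + idiag ]
    // normalise_output folds slabs ighel = 1..dcNGoodHel-1 into slab 0, then
    // applies the weight numerators[channelId - 1] / denominators per event.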
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h
index 0afe32f972..cedb7fcb32 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h
@@ -166,6 +166,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -174,6 +175,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // input: if true, multiply the channel weight into the ME output
 #endif
     fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -190,6 +193,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -197,6 +201,8 @@ namespace mg5amcCpu
     int* allselcol,          // output: helicity selection[nevt]
     fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // input: if true, multiply the channel weight into the ME output
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc
index 451245bb01..58def243d0 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc
@@ -306,7 +306,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
     fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+    bool storeChannelWeights, // input: if true, accumulate the multichannel numerators and denominators for this helicity
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -315,7 +315,7 @@ namespace mg5amcCpu
 #else
     cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+    bool storeChannelWeights, // input: if true, accumulate the multichannel numerators and denominators for this helicity
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -411,7 +411,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -433,10 +434,6 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
     fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
@@ -464,7 +461,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1
       FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[0] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -483,7 +480,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 2
       FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[1] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -502,7 +499,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 3
       VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[2] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -521,7 +518,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 4
       FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[3] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -540,7 +537,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 5
       FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[4] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -560,7 +557,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 6
       FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[5] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -578,7 +575,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 7
       FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[6] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -595,7 +592,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 8
       FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[7] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -612,7 +609,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 9
       FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[8] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -629,7 +626,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 10
       VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[9] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -647,7 +644,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 11
       FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[10] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -664,7 +661,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 12
       FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[11] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -681,7 +678,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 13
       FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[12] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -698,7 +695,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 14
       FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[13] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -715,7 +712,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 15
       VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[14] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -733,7 +730,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 16
       FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[15] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -750,7 +747,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 17
       FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
      {
         numerators_sv[16] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -767,7 +764,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 18
       FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[17] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -784,7 +781,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 19
       FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[18] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -801,7 +798,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 20
       VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[19] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -819,7 +816,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 21
       FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[20] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -836,7 +833,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 22
       FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[21] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -853,7 +850,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 23
       FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[22] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -870,7 +867,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 24
       FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[23] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -887,7 +884,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 25
       VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[24] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -904,7 +901,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 26
       FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[25] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -921,7 +918,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 27
       FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[26] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -938,7 +935,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 28
       FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[27] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -955,7 +952,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 29
       FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[28] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -972,7 +969,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 30
       FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[29] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -989,7 +986,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 31
       FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[30] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1006,7 +1003,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 32
       FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[31] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1023,7 +1020,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 33
       FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[32] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1062,7 +1059,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 35
       VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[34] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1081,7 +1078,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 36
       VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[35] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1453,9 +1450,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -1531,8 +1527,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1594,25 +1589,35 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights,          // input: if true, compute the final multichannel weights
+    bool mulChannelWeight,             // input: if true, multiply the matrix element by the channel weight
 #endif
-    const fptype globaldenom ) /* clang-format on */
+    const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
      fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        const unsigned int channelId = allChannelIds[ievt]; // NB: mulChannelWeight assumes allChannelIds != nullptr
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -1657,16 +1662,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                      // output: color selection[nevt]
-              const fptype* allrndcol,             // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds,   // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,          // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                     // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids[nevt]
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: per-diagram numerators, summed over helicities
+                       const fptype* allDenominators,     // input: denominators, summed over helicities
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels; // fallback: last channel (guards against rounding when allrnddiagram[ievt] ~ 1)
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1734,6 +1767,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -1743,6 +1777,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // input: if true, multiply the channel weight into the ME output
 #endif
     fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1754,8 +1790,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol,          // output: helicity selection[nevt]
-    fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // input: if true, multiply the channel weight into the ME output
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1798,7 +1836,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1817,7 +1855,9 @@ namespace mg5amcCpu
     fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
     for( int i = 0; i < processConfig::ndiagrams; ++i )
+    {
       numerators_sv[i] = fptype_sv{ 0 };
+    }
     denominators_sv = fptype_sv{ 0 };
 #endif
   }
@@ -1829,6 +1869,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
   // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+
   // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
   // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
   // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1837,9 +1878,10 @@ namespace mg5amcCpu
     const int ihel = cGoodHel[ighel];
     fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+    fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
     fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
     gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1850,9 +1892,15 @@ namespace mg5amcCpu
   // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
   // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
   gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-  // Event-by-event random choice of color #402
-  gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+  bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+  // Event-by-event random choice of color and diagram #402
+  gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+  gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
   // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1890,9 +1938,6 @@ namespace mg5amcCpu
     const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
     const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
     // Running sum of partial amplitudes squared for event by event color selection (#402)
     // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1907,7 +1952,8 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -2055,13 +2116,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -2069,7 +2124,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
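The channel choice added above is an inverse-CDF draw over the per-diagram weights: each open channel d is selected with probability N_d / sum_d N_d, where N_d is the |amp|^2 numerator accumulated for diagram d, and diagrams with channel2iconfig == -1 are excluded from both the normalization and the draw. A minimal standalone C++ sketch of the same technique (not the generated code; 'numerators', 'skip' and 'r' are illustrative stand-ins for the per-diagram numerator sums, the channel2iconfig == -1 mask and allrnddiagram[ievt]):

#include <cassert>
#include <vector>
// Draw a 1-based channel id with probability numerators[d] / norm, skipping masked diagrams.
inline int sampleChannel( const std::vector<double>& numerators, const std::vector<bool>& skip, double r )
{
  double norm = 0.;
  for( size_t d = 0; d < numerators.size(); ++d )
    if( !skip[d] ) norm += numerators[d]; // normalization: sum of the contributing weights
  assert( norm > 0. );
  double cumsum = 0.; // running numerator of the CDF
  for( size_t d = 0; d < numerators.size(); ++d )
  {
    if( skip[d] ) continue;
    cumsum += numerators[d];
    if( r < cumsum / norm ) return (int)d + 1; // first bin whose CDF exceeds r
  }
  return (int)numerators.size(); // fallback for r ~ 1 (mirrors 'channelIdVec[ieppV] = mgOnGpu::nchannels' above)
}

The event-by-event color choice a few lines above uses the same cumulative trick, with the icolamp-masked jamp2 sums playing the role of the weights.

diff --git 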
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index 949f0f0e2a..a7b234154a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index 772392b151..cc26513453 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -308,7 +308,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel 
fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -317,7 +317,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -413,7 +413,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -435,10 +436,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -466,7 +463,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -485,7 +482,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -504,7 +501,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -521,7 +518,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[5], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -540,7 +537,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[1], w_fp[3], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -559,7 +556,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( 
storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -578,7 +575,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -950,9 +947,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1028,8 +1024,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
#endif /* clang-format on */ @@ -1091,25 +1086,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply the matrix element by the channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // false if both allChannelIds and allrnddiagram are nullptr (cf. segfault #892) { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; // denominators remain one scalar per event and per helicity + totAllDenominators[ievt] += hAllDenominators[ievt]; // keep accumulating the total denominator over helicities + } + if( mulChannelWeight && allChannelIds != nullptr ) // multiply only if channelIds were provided (avoid a nullptr dereference) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1154,16 +1159,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: numerators[nevt*ndiagrams], summed over helicities + const fptype* allDenominators, // input: denominators[nevt], summed over helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1231,6 +1264,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1240,6 +1274,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1251,8 +1287,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1295,7 +1333,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1314,7 +1352,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1326,6 +1366,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1334,9 +1375,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1347,9 +1389,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1387,9 +1435,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1404,7 +1449,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1452,82 +1498,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1552,13 +1613,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1566,7 +1621,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
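The SIMD loop above addresses the per-diagram numerator buffer in AOSOA layout: events are grouped in pages of neppV, and inside each page the buffer holds one contiguous neppV-vector per diagram, so the flat offset decomposes as page * neppV * ndiagrams + diagram * neppV + slot. A minimal sketch of that mapping, with 'neppV' and 'ndiagrams' as stand-ins for the generated compile-time constants (not the generated accessor classes):

// Map (event, diagram) to a flat index in the per-diagram numerator buffer.
template <int neppV, int ndiagrams>
constexpr int numeratorIndex( int ievt, int idiag )
{
  const int ipagV = ievt / neppV; // SIMD page containing this event
  const int ieppV = ievt % neppV; // slot of this event inside its page
  return ipagV * neppV * ndiagrams + idiag * neppV + ieppV;
}
// Example: with neppV=4 and ndiagrams=7, event 5 (page 1, slot 1) and diagram 2
// land at 1*4*7 + 2*4 + 1 = 37.
static_assert( numeratorIndex<4, 7>( 5, 2 ) == 37, "AOSOA numerator indexing sketch" );

In the mixed-precision case the ieppV loop spans two neppV pages, which is why the code above divides ievt (not ieppV) by neppV when computing the page and takes ieppV % neppV for the slot.

diff --git 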
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 25aacba743..23cb81ba6a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -168,6 +168,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -176,6 +177,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -192,6 +195,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -199,6 +203,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index 111d7bd91e..9603b4f631 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -314,7 +314,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel 
fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -323,7 +323,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -419,7 +419,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -441,10 +442,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -472,7 +469,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -491,7 +488,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -510,7 +507,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -527,7 +524,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -546,7 +543,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -565,7 +562,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( 
storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -584,7 +581,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -956,9 +953,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1034,8 +1030,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
#endif /* clang-format on */ @@ -1097,25 +1092,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply the matrix element by the channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // false if both allChannelIds and allrnddiagram are nullptr (cf. segfault #892) { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; // denominators remain one scalar per event and per helicity + totAllDenominators[ievt] += hAllDenominators[ievt]; // keep accumulating the total denominator over helicities + } + if( mulChannelWeight && allChannelIds != nullptr ) // multiply only if channelIds were provided (avoid a nullptr dereference) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1160,16 +1165,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: numerators[nevt*ndiagrams], summed over helicities + const fptype* allDenominators, // input: denominators[nevt], summed over helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1237,6 +1270,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1246,6 +1280,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1257,8 +1293,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1301,7 +1339,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1320,7 +1358,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1332,6 +1372,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1340,9 +1381,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1353,9 +1395,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1393,9 +1441,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1410,7 +1455,8 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1458,82 +1504,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
         }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
        {
          const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          // NB: allNumerators stores ndiagrams contiguous blocks of neppV lanes per SIMD page, hence the ievt / neppV and ieppV % neppV index arithmetic
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
          {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
             {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
               break;
             }
          }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelIds and diagram sampling are both disabled (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -1558,13 +1619,7 @@ namespace mg5amcCpu
       // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
       // [NB 'sum over final spins, average over initial spins', eg see
       // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -1572,7 +1627,7 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
         const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
         fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h
index ac3df5ae1d..05b5116162 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h
@@ -174,6 +174,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+      const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
       fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
       int* allselhel,                    // output: helicity selection[nevt]
@@ -182,6 +183,8 @@ namespace mg5amcCpu
       fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
       fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
       fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+      unsigned int* allDiagramIdsOut,    // output: sampled diagram ids[nevt] (1 to #diagrams)
+      bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
       fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
       fptype* ghelAllJamps,              // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -198,6 +201,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+      const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
       fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
       int* allselhel,                    // output: helicity selection[nevt]
@@ -205,6 +209,8 @@ namespace mg5amcCpu
       int* allselcol,                    // output: helicity selection[nevt]
       fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
       fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+      unsigned int* allDiagramIdsOut,    // output: sampled diagram ids[nevt] (1 to #diagrams)
+      bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
       const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc
index 1f57233a6f..a72a6225b4 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc
@@ -306,7 +306,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
       fptype* allJamps,        // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+      bool storeChannelWeights,          // input: if true, accumulate per-diagram numerators and denominators
       fptype* allNumerators,   // input/output: multichannel numerators[nevt], add helicity ihel
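+      // NB (annotation, our reading of this patch rather than upstream wording): allNumerators now
+      // holds processConfig::ndiagrams slots per event instead of one, i.e. on the GPU the weight of
+      // diagram idiag for event ievt sits at allNumerators[ievt * processConfig::ndiagrams + idiag]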
       fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
       fptype* colAllJamp2s,    // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -315,7 +315,7 @@ namespace mg5amcCpu
 #else
       cxtype_sv* allJamp_sv,   // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+      bool storeChannelWeights,     // input: if true, accumulate per-diagram numerators and denominators
       fptype* allNumerators,   // input/output: multichannel numerators[nevt], add helicity ihel
       fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
       fptype_sv* jamp2_sv,     // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -411,7 +411,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; // each event owns ndiagrams numerator slots
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -433,10 +434,6 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
     fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
@@ -464,7 +461,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 1
     FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[0] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -483,7 +480,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 2
     FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[1] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -502,7 +499,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 3
     VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[2] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -519,7 +516,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 4
     FFV1_0( w_fp[9], w_fp[5], w_fp[8], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[3] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -538,7 +535,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 5
     FFV1_0( w_fp[1], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[4] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -559,7 +556,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 6
     FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[5] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -578,7 +575,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 7
     FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[6] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -597,7 +594,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 8
     VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[7] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -614,7 +611,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 9
     FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[8] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -633,7 +630,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 10
     FFV1_0( w_fp[1], w_fp[10], w_fp[8], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[9] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -652,7 +649,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 11
     FFV1_0( w_fp[10], w_fp[5], w_fp[8], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[10] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -671,7 +668,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 12
     FFV1_0( w_fp[10], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[11] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -690,7 +687,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 13
     FFV1_0( w_fp[6], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[12] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -709,7 +706,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 14
     FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[13] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -1081,9 +1078,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -1159,8 +1155,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1222,25 +1217,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
    allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
    {
      fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
      fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
      for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
      {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
-        fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
-        totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
      }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
    }
 #endif
    return;
@@ -1285,16 +1290,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
    // SCALAR channelId for the current event (CUDA)
    unsigned int channelId = gpu_channelId( allChannelIds );
    // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      // NB: on the GPU each event owns processConfig::ndiagrams contiguous numerator slots
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
    {
      if( channelId > mgOnGpu::nchannels )
@@ -1362,6 +1395,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+      const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
       fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
       int* allselhel,                    // output: helicity selection[nevt]
@@ -1371,6 +1405,8 @@ namespace mg5amcCpu
       fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
       fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
       fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+      unsigned int* allDiagramIdsOut,    // output: sampled diagram ids[nevt] (1 to #diagrams)
+      bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
       fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
       fptype* ghelAllJamps,              // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1382,8 +1418,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       int* allselcol,                    // output: helicity selection[nevt]
-      fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
-      fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+      fptype* allNumerators,             // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities
+      fptype* allDenominators,           // output: multichannel denominators[nevt], running_sum_over_helicities
+      unsigned int* allDiagramIdsOut,    // output: sampled diagram ids[nevt] (1 to #diagrams)
+      bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
       const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1426,7 +1464,7 @@ namespace mg5amcCpu
    gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
    gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
    gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1445,7 +1483,9 @@ namespace mg5amcCpu
      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
      fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
        numerators_sv[i] = fptype_sv{ 0 };
+      }
      denominators_sv = fptype_sv{ 0 };
 #endif
    }
@@ -1457,6 +1497,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
    // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+
    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
    // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
    // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1465,9 +1506,10 @@ namespace mg5amcCpu
      const int ihel = cGoodHel[ighel];
      fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1478,9 +1520,15 @@ namespace mg5amcCpu
    // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
    // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
    gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
    // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1518,9 +1566,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      unsigned int channelId = getChannelId( allChannelIds, ievt00 );
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1535,7 +1580,8 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1583,82 +1629,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
        }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
        {
          const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          // NB: allNumerators stores ndiagrams contiguous blocks of neppV lanes per SIMD page, hence the ievt / neppV and ieppV % neppV index arithmetic
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
          {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
            {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
              break;
            }
          }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelIds and diagram sampling are both disabled (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
          for( int icolC = 0; icolC < ncolor; icolC++ )
          {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
            {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
              break;
            }
          }
-#endif
        }
      }
      else
@@ -1683,13 +1744,7 @@ namespace mg5amcCpu
       // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
       // [NB 'sum over final spins, average over initial spins', eg see
       // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -1697,7 +1752,7 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
         const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
         fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h
index 8af0c5a78c..70a92da32a 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h
@@ -166,6 +166,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+      const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
       fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
       int* allselhel,                    // output: helicity selection[nevt]
@@ -174,6 +175,8 @@ namespace mg5amcCpu
       fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
       fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
       fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+      unsigned int* allDiagramIdsOut,    // output: sampled diagram ids[nevt] (1 to #diagrams)
+      bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
       fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
       fptype* ghelAllJamps,              // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -190,6 +193,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+      const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
       fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
       int* allselhel,                    // output: helicity selection[nevt]
@@ -197,6 +201,8 @@ namespace mg5amcCpu
       int* allselcol,                    // output: helicity selection[nevt]
       fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
       fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+      unsigned int* allDiagramIdsOut,    // output: sampled diagram ids[nevt] (1 to #diagrams)
+      bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
       const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc
index e2779260ff..83505e62ba 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc
@@ -314,7 +314,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
       fptype* allJamps,        // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+      bool storeChannelWeights,          // input: if true, accumulate per-diagram numerators and denominators
       fptype* allNumerators,   // input/output: multichannel numerators[nevt], add helicity ihel
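+      // NB (annotation, inferred from this patch): storeChannelWeights replaces the old per-event
+      // channelId input; numerators and denominators are now accumulated for ALL diagrams whenever
+      // either SDE channelIds or diagram-sampling random numbers are supplied by the caller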
       fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
       fptype* colAllJamp2s,    // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -323,7 +323,7 @@ namespace mg5amcCpu
 #else
       cxtype_sv* allJamp_sv,   // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+      bool storeChannelWeights,     // input: if true, accumulate per-diagram numerators and denominators
       fptype* allNumerators,   // input/output: multichannel numerators[nevt], add helicity ihel
       fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
       fptype_sv* jamp2_sv,     // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -419,7 +419,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; // each event owns ndiagrams numerator slots
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -441,10 +442,6 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
     fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
@@ -472,7 +469,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 1
     FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[0] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -491,7 +488,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 2
     FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[1] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -510,7 +507,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 3
     VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[2] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -527,7 +524,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 4
     FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[3] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -546,7 +543,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 5
     FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[4] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -565,7 +562,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 6
     FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[5] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -584,7 +581,7 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 7
     FFV1_0( w_fp[3], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId != 0 )
+    if( storeChannelWeights )
     {
       numerators_sv[6] += cxabs2( amp_sv[0] );
       denominators_sv += cxabs2( amp_sv[0] );
@@ -956,9 +953,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -1034,8 +1030,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
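+      // NB (annotation): in the multichannel branch above, storeChannelWeights is hard-coded to
+      // false, mirroring the old 'channelId = 0': no channel weights are needed while filtering
+      // good helicities, only the MEs themselves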
#endif /* clang-format on */ @@ -1097,25 +1092,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1160,16 +1165,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1237,6 +1270,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1246,6 +1280,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1257,8 +1293,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1301,7 +1339,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1320,7 +1358,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1332,6 +1372,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1340,9 +1381,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1353,9 +1395,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1393,9 +1441,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1410,7 +1455,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1458,82 +1504,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+        }
+        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+        if( iconfig <= 0 )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+          assert( iconfig > 0 ); // SANITY CHECK #917
+        }
+        else if( iconfig > (int)mgOnGpu::nconfigSDE )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+        }
+        fptype targetamp[ncolor] = { 0 };
+        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+        for( int icolC = 0; icolC < ncolor; icolC++ )
+        {
+          if( icolC == 0 )
+            targetamp[icolC] = 0;
+          else
+            targetamp[icolC] = targetamp[icolC - 1];
+          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+            jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+        }
+        const int ievt = ievt00 + ieppV;
+        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
         for( int icolC = 0; icolC < ncolor; icolC++ )
         {
-          if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+          if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
           {
-            allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-            //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+            allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+            //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
             break;
           }
         }
-#endif
       }
     }
     else
@@ -1558,13 +1619,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
    // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -1572,7 +1627,7 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
         const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
         fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
diff --git
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index f37d8b5515..b7ffff9d65 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -174,6 +174,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -182,6 +183,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -198,6 +201,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -205,6 +209,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index c105c712fd..4c8f471aba 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -306,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity 
ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -315,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -411,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -433,10 +434,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -464,7 +461,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -483,7 +480,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -502,7 +499,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -521,7 +518,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -540,7 +537,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -560,7 +557,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( 
storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -578,7 +575,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[0], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -595,7 +592,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -612,7 +609,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -629,7 +626,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 VVV1_0( w_fp[5], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -647,7 +644,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -664,7 +661,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[0], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -681,7 +678,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[12] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -698,7 +695,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[13] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -715,7 +712,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[5], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[14] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -733,7 +730,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[15] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -750,7 +747,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[16] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -767,7 +764,7 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[17] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -784,7 +781,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[0], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[18] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -801,7 +798,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[5], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[19] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -819,7 +816,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[20] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -836,7 +833,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[21] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -853,7 +850,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[22] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -870,7 +867,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[23] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -887,7 +884,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[24] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -904,7 +901,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[25] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -921,7 +918,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[26] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -938,7 +935,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[27] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -955,7 +952,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); 
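// The 'if( channelId != 0 )' -> 'if( storeChannelWeights )' replacements in these
// amplitude hunks all gate the same bookkeeping: every diagram adds its |amp|^2
// once to its own per-diagram numerator and once to a denominator shared by all
// diagrams, so that after the sum over good helicities the single-diagram
// enhancement weight of (1-based) channel c is numerators[c - 1] / denominator.
// A minimal sketch of that pattern follows; 'amp2ForDiagram' is an illustrative
// helper, not a function of the generated code:
//
//   fptype numerators[processConfig::ndiagrams] = {}; // one running sum per diagram
//   fptype denominator = 0;                           // common running sum over all diagrams
//   for( int idiag = 0; idiag < processConfig::ndiagrams; idiag++ )
//   {
//     const fptype amp2 = amp2ForDiagram( idiag ); // hypothetical |amp|^2 of diagram idiag
//     if( storeChannelWeights ) // skip the bookkeeping when no channel weights are needed
//     {
//       numerators[idiag] += amp2;
//       denominator += amp2;
//     }
//   }
//   // channel weight of (1-based) channel c: numerators[c - 1] / denominator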
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[28] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -972,7 +969,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[0], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[29] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -989,7 +986,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[30] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1006,7 +1003,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[31] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1023,7 +1020,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[32] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1062,7 +1059,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 VVV1_0( w_fp[5], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[34] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1081,7 +1078,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[35] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1453,9 +1450,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1531,8 +1527,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
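// Note on the calculate_jamps signature change here: during helicity filtering
// (getGoodHel) no channel weights are needed, so the old sentinel argument
// 'channelId = 0' is replaced by an explicit 'storeChannelWeights = false'.
// When the weights are needed, the numerators are now stored per diagram, so the
// GPU super-buffer grows from [nGoodHel][nevt] to [nGoodHel][nevt][ndiagrams].
// A minimal indexing sketch under that assumed layout ('numeratorIndex' is an
// illustrative helper, not part of the generated code):
//
//   inline size_t numeratorIndex( size_t ighel, size_t ievt, size_t idiag, size_t nevt )
//   {
//     return ( ighel * nevt + ievt ) * processConfig::ndiagrams + idiag;
//   }
//
// This matches both the gpuMemset size
// 'cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype )' and the
// kernel-side base pointer '&allNumerators[ievt * processConfig::ndiagrams]'.
// The C++ SIMD path interleaves differently within an event page, cf. the index
// 'ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV'.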
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1594,25 +1589,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
 {
   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
   allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   const int nevt = gridDim.x * blockDim.x;
-  if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+  if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
   {
     fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
     fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
     for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
     {
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      totAllNumerators[ievt] += hAllNumerators[ievt];
       totAllDenominators[ievt] += hAllDenominators[ievt];
+      fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+      fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+      for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+      {
+        firstNumerator[idiag] += hAllNumerators[idiag];
+      }
+    }
+    if( mulChannelWeight && allChannelIds != nullptr ) // guard against null channelIds (cf. segfault #892)
+    {
+      unsigned int channelId = allChannelIds[ievt];
+      allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
     }
-    allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
   }
 #endif
   return;
@@ -1657,16 +1662,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice
(nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1734,6 +1767,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1743,6 +1777,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1754,8 +1790,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1798,7 +1836,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * 
sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1817,7 +1855,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1829,6 +1869,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1837,9 +1878,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1850,9 +1892,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + 
gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1890,9 +1938,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1907,7 +1952,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1955,82 +2001,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+        }
+        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+        if( iconfig <= 0 )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+          assert( iconfig > 0 ); // SANITY CHECK #917
+        }
+        else if( iconfig > (int)mgOnGpu::nconfigSDE )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+        }
+        fptype targetamp[ncolor] = { 0 };
+        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+        for( int icolC = 0; icolC < ncolor; icolC++ )
+        {
+          if( icolC == 0 )
+            targetamp[icolC] = 0;
+          else
+            targetamp[icolC] = targetamp[icolC - 1];
+          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+            jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+        }
+        const int ievt = ievt00 + ieppV;
+        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
         for( int icolC = 0; icolC < ncolor; icolC++ )
         {
-          if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+          if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
           {
-            allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-            //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+            allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+            //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
             break;
           }
         }
-#endif
       }
     }
     else
@@ -2055,13 +2116,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
    // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
      const int ievt0 = ipagV * neppV;
@@ -2069,7 +2124,7 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
      {
         const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
         fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
diff --git
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 311a96d812..8be5530c1c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 915207bda3..25cb87562c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -306,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity 
ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -315,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -411,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -433,10 +434,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -464,7 +461,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -483,7 +480,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -502,7 +499,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -519,7 +516,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -538,7 +535,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -559,7 +556,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( 
storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -578,7 +575,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -597,7 +594,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -614,7 +611,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -633,7 +630,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -652,7 +649,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -671,7 +668,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[10], w_fp[4], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -690,7 +687,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[6], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[12] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -709,7 +706,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[13] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1081,9 +1078,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, 
gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -1159,8 +1155,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1222,25 +1217,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
 {
   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
   allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   const int nevt = gridDim.x * blockDim.x;
-  if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+  if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
   {
     fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
     fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
     for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
     {
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      totAllNumerators[ievt] += hAllNumerators[ievt];
       totAllDenominators[ievt] += hAllDenominators[ievt];
+      fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+      fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+      for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+      {
+        firstNumerator[idiag] += hAllNumerators[idiag];
+      }
+    }
+    if( mulChannelWeight && allChannelIds != nullptr ) // guard against null channelIds (cf. segfault #892)
+    {
+      unsigned int channelId = allChannelIds[ievt];
+      allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
     }
-    allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
   }
 #endif
   return;
@@ -1285,16 +1290,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol, // output: color
selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1362,6 +1395,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1371,6 +1405,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1382,8 +1418,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* 
allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1426,7 +1464,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1445,7 +1483,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1457,6 +1497,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1465,9 +1506,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1478,9 +1520,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( 
normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1518,9 +1566,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1535,7 +1580,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1583,82 +1629,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1683,13 +1744,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1697,7 +1752,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index 75597d043e..13ce403cae 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index 895b1674ac..5096994855 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -308,7 +308,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], 
add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -317,7 +317,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -413,7 +413,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -435,10 +436,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -466,7 +463,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -485,7 +482,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -504,7 +501,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -521,7 +518,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -540,7 +537,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -559,7 +556,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[0], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) +
if( storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -578,7 +575,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[3], w_fp[0], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -950,9 +947,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1028,8 +1024,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
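[Note on the new buffer layout, which recurs in every subprocess file in this patch: the numerator buffers grow from one scalar per event to one entry per diagram, so that a channel can be sampled a posteriori from the stored per-diagram weights. The following is a minimal C++ sketch of the addressing scheme implied by the hunks (helicity-major, then event, then diagram); the helper names numIndex and sumNumeratorsOverHelicities are illustrative only and do not appear in the generated code.

#include <cstddef>
namespace processConfig { constexpr int ndiagrams = 14; } // illustrative value; generated per process
using fptype = double;

// Element (ighel, ievt, idiag) of the ghelAllNumerators super-buffer,
// as addressed in calculate_jamps and normalise_output
inline std::size_t numIndex( int ighel, int ievt, int idiag, int nevt )
{
  return ( static_cast<std::size_t>( ighel ) * nevt + ievt ) * processConfig::ndiagrams + idiag;
}

// The per-event reduction done in normalise_output: fold each helicity's per-diagram
// numerators into the "helicity #0" slice, which then holds the totals per diagram
void sumNumeratorsOverHelicities( fptype* ghelAllNumerators, int nGoodHel, int nevt, int ievt )
{
  for( int ighel = 1; ighel < nGoodHel; ighel++ ) // NB: the loop starts at ighel=1, as in the patch
    for( int idiag = 0; idiag < processConfig::ndiagrams; idiag++ )
      ghelAllNumerators[numIndex( 0, ievt, idiag, nevt )] +=
        ghelAllNumerators[numIndex( ighel, ievt, idiag, nevt )];
}

This layout is also why the gpuMemset for ghelAllNumerators gains a factor processConfig::ndiagrams while ghelAllDenominators keeps its cNGoodHel * nevt elements.]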
#endif /* clang-format on */ @@ -1091,25 +1086,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1154,16 +1159,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1231,6 +1264,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1240,6 +1274,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1251,8 +1287,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1295,7 +1333,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1314,7 +1352,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1326,6 +1366,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1334,9 +1375,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1347,9 +1389,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1387,9 +1435,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1404,7 +1449,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1452,82 +1498,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1552,13 +1613,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1566,7 +1621,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index ebbade848b..f4abe8c1e9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -168,6 +168,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -176,6 +177,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -192,6 +195,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -199,6 +203,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index 10ccb38efa..44dceb663b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -306,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel 
numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -315,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -411,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -433,10 +434,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -464,7 +461,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -483,7 +480,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -502,7 +499,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -519,7 +516,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -538,7 +535,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -559,7 +556,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if(
channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -578,7 +575,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -597,7 +594,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -614,7 +611,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[0], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -633,7 +630,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -652,7 +649,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -671,7 +668,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[10], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -690,7 +687,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[6], w_fp[0], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[12] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -709,7 +706,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[10], w_fp[0], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[13] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1081,9 +1078,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel(
calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1159,8 +1155,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1222,25 +1217,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // input: if true, sum the multichannel numerators and denominators over helicities + bool mulChannelWeight, // input: if true, multiply the ME output by the channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } totAllDenominators[ievt] += ghelAllDenominators[ievt + ighel * nevt]; // also sum the denominators over helicities (used in the weight below) + } + if( mulChannelWeight && allChannelIds != nullptr ) // as in the C++ path: no channel weight without input channelIds + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1285,16 +1290,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* 
allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: per-diagram numerators, summed over helicities + const fptype* allDenominators, // input: per-event denominators, summed over helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; // fallback: keep the last channel if rounding leaves the draw unresolved + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) { @@ -1362,6 +1395,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1371,6 +1405,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1382,8 +1418,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], 
running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1426,7 +1464,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1445,7 +1483,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1457,6 +1497,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1465,9 +1506,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1478,9 +1520,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || 
allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1518,9 +1566,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1535,7 +1580,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1583,82 +1629,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1683,13 +1744,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1697,7 +1752,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 17c9c0faf1..e4b749f215 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o 
$(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index 0986c3df28..ddfce4015b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -73,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.0343935489654541  +DEBUG: model prefixing takes 0.034606218338012695  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -88,17 +88,17 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 1.421 s +1 processes with 72 diagrams generated in 1.407 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_smeft_gg_tttt INFO: remove old information in CODEGEN_mad_smeft_gg_tttt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  @@ -117,22 +117,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxttx DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1564]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1588]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 
36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1589]  -Generated helas calls for 1 subprocesses (72 diagrams) in 0.069 s -Wrote files for 119 helas calls in 1.641 s +Generated helas calls for 1 subprocesses (72 diagrams) in 0.068 s +Wrote files for 119 helas calls in 3.822 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.144 s +ALOHA: aloha creates 5 routines in 0.160 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.137 s +ALOHA: aloha creates 10 routines in 0.152 s VVV5 VVV5 FFV1 @@ -157,17 +157,17 @@ INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  +DEBUG: result.returncode =  0 [output.py at line 275]  Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. Type "launch" to generate events from this process, or see /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README Run "open index.html" to see more information about this process. quit -real 0m9.061s -user 0m3.272s -sys 0m0.694s -Code generation completed in 9 seconds +real 0m13.208s +user 0m3.391s +sys 0m0.650s +Code generation completed in 13 seconds /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t 
hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc index 1eb18a90d3..7344868d8d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); 
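// ------------------------------------------------------------------------
// [Editor's sketch: an illustration of the buffer layout this patch assumes,
// not generated code; the helper name below is hypothetical.]
// Each event now owns processConfig::ndiagrams numerators instead of one: on
// the GPU the slice is contiguous per event (numerators =
// &allNumerators[ievt * ndiagrams]), while on the C++/SIMD side each page of
// neppV events stores the ndiagrams vectors back to back, so a flat index
// can be written as:
inline int numeratorIndex( int ipagV, int idiag, int ieppV, int neppV, int ndiagrams )
{
  return ( ipagV * ndiagrams + idiag ) * neppV + ieppV; // page, then diagram, then SIMD lane
}
// This reproduces the C++ sigmaKin lookup 'allNumerators[ievt / neppV * neppV
// * processConfig::ndiagrams + ichan * neppV + ieppV % neppV]' used above.
// ------------------------------------------------------------------------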
fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -461,7 +458,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -480,7 +477,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -499,7 +496,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV5_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -519,7 +516,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -538,7 +535,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[8], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -557,7 +554,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 VVV5_0( w_fp[6], w_fp[11], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -576,7 +573,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[6] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -595,7 +592,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[7] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -614,7 +611,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[8] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -633,7 +630,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[9] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -654,7 +651,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[10] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -671,7 +668,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram 
number 12 FFV1_0( w_fp[13], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[11] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -689,7 +686,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[5], w_fp[6], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[12] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -706,7 +703,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[13] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -723,7 +720,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 FFV1_0( w_fp[15], w_fp[4], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[14] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -740,7 +737,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[15], w_fp[12], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[15] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -757,7 +754,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[5], w_fp[16], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[16] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -774,7 +771,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[17] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -791,7 +788,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[18] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -808,7 +805,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV5_0( w_fp[1], w_fp[10], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[19] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -827,7 +824,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[5], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[20] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -844,7 +841,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[14], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[21] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -861,7 +858,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[5], w_fp[6], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( 
channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[22] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -878,7 +875,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[14], w_fp[6], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[23] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -895,7 +892,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 FFV1_0( w_fp[15], w_fp[4], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[24] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -912,7 +909,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[25] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -929,7 +926,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[17], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[26] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -946,7 +943,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 VVV5_0( w_fp[1], w_fp[11], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[27] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -963,7 +960,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[17], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[28] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -980,7 +977,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 VVV5_0( w_fp[1], w_fp[10], w_fp[12], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[29] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -998,7 +995,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[5], w_fp[16], w_fp[17], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[30] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1015,7 +1012,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[31] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1032,7 +1029,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[32] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -1049,7 +1046,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 34 FFV1_0( w_fp[13], w_fp[12], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[33] += cxabs2( amp_sv[0] ); denominators_sv += 
cxabs2( amp_sv[0] );
@@ -1066,7 +1063,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 35
       FFV1_0( w_fp[15], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[34] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1083,7 +1080,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 36
       FFV1_0( w_fp[15], w_fp[12], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[35] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1100,7 +1097,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 37
       FFV1_0( w_fp[5], w_fp[14], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[36] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1117,7 +1114,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 38
       VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[37] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1134,7 +1131,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 39
       FFV1_0( w_fp[3], w_fp[14], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[38] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1151,7 +1148,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 40
       VVV5_0( w_fp[1], w_fp[11], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[39] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1169,7 +1166,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 41
       FFV1_0( w_fp[3], w_fp[16], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[40] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1186,7 +1183,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 42
       FFV1_0( w_fp[17], w_fp[16], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[41] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1203,7 +1200,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 43
       FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[42] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1220,7 +1217,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 44
       FFV1_0( w_fp[13], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[43] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1237,7 +1234,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 45
       FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
      {
         numerators_sv[44] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1254,7 +1251,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 46
       FFV1_0( w_fp[17], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[45] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1271,7 +1268,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 47
       FFV1_0( w_fp[12], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[46] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1288,7 +1285,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 48
       VVV5_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[47] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1305,7 +1302,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 49
       FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[48] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1322,7 +1319,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 50
       VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[49] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1339,7 +1336,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 51
       FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[50] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1356,7 +1353,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 52
       FFV1_0( w_fp[5], w_fp[16], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[51] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1373,7 +1370,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 53
       FFV1_0( w_fp[3], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[52] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1390,7 +1387,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 54
       FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[53] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1407,7 +1404,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 55
       FFV1_0( w_fp[16], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[54] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1424,7 +1421,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 56
       FFV1_0( w_fp[13], w_fp[4], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[55] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1441,7 +1438,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 57
       FFV1_0( w_fp[16], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[56] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1458,7 +1455,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 58
       FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[57] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1475,7 +1472,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 59
       FFV1_0( w_fp[5], w_fp[13], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[58] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1492,7 +1489,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 60
       FFV1_0( w_fp[5], w_fp[6], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[59] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1509,7 +1506,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 61
       FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[60] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1526,7 +1523,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 62
       FFV1_0( w_fp[3], w_fp[6], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[61] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1543,7 +1540,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 63
       FFV1_0( w_fp[6], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[62] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1560,7 +1557,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 64
       FFV1_0( w_fp[15], w_fp[4], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[63] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1577,7 +1574,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 65
       FFV1_0( w_fp[6], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[64] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1594,7 +1591,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 66
       FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[65] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1633,7 +1630,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 68
       VVV5_0( w_fp[1], w_fp[10], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[67] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1652,7 +1649,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 69
       VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[68] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1693,7 +1690,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 71
       VVV5_0( w_fp[1], w_fp[8], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[70] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -1712,7 +1709,7 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 72
       VVV5_0( w_fp[1], w_fp[11], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId != 0 )
+      if( storeChannelWeights )
       {
         numerators_sv[71] += cxabs2( amp_sv[0] );
         denominators_sv += cxabs2( amp_sv[0] );
@@ -2084,9 +2081,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -2162,8 +2158,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
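The normalise_output hunk that follows switches from one numerator per event to one numerator per diagram per event, so that a single-diagram weight can be computed a posteriori for any channel. The following is a minimal standalone host-side sketch of that buffer layout and of the final channel weight; it is not the plugin code, the toy sizes and values are invented, and only the index formulas mirror the kernel:

#include <cassert>
#include <vector>

int main()
{
  constexpr int ndiagrams = 3, nevt = 2, nGoodHel = 2; // toy sizes (invented)
  // numerator index: ( ievt + ighel * nevt ) * ndiagrams + idiag
  std::vector<double> num( nGoodHel * nevt * ndiagrams, 1. );  // dummy per-diagram |amp|^2 values
  std::vector<double> den( nGoodHel * nevt, double( ndiagrams ) ); // dummy per-event denominators
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    // fold helicities ighel>=1 into the "helicity #0" slots, as normalise_output does
    for( int ighel = 1; ighel < nGoodHel; ighel++ )
    {
      den[ievt] += den[ievt + ighel * nevt];
      for( int idiag = 0; idiag < ndiagrams; idiag++ )
        num[ievt * ndiagrams + idiag] += num[( ievt + ighel * nevt ) * ndiagrams + idiag];
    }
    const unsigned int channelId = 2; // example channel (1-based, as in the patch)
    const double weight = num[channelId - 1 + ievt * ndiagrams] / den[ievt];
    assert( weight > 0. && weight <= 1. ); // one diagram's share of the sum over diagrams
  }
  return 0;
}

Since the denominator already accumulates cxabs2( amp_sv[0] ) over all diagrams within each helicity, the per-diagram totals divided by the denominator total give the usual SDE channel weights, now for every diagram at once.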
@@ -2225,25 +2220,37 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // input: if true, compute the final multichannel weights
+                    bool mulChannelWeight,             // input: if true, multiply the matrix element by the channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
 {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt]; // sum denominators over helicities into the "helicity #0" buffer
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag]; // sum per-diagram numerators over helicities
+        }
+      }
+      if( mulChannelWeight && allChannelIds != nullptr )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -2288,16 +2293,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids[nevt] (1 to #diagrams)
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: per-diagram numerators, summed over helicities
+                       const fptype* allDenominators,     // input: denominators, summed over helicities
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -2365,6 +2398,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -2374,6 +2408,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -2385,8 +2421,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -2429,7 +2467,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -2448,7 +2486,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -2460,6 +2500,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -2468,9 +2509,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -2481,9 +2523,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2521,9 +2569,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -2538,7 +2583,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -2586,82 +2632,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -2686,13 +2747,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -2700,7 +2755,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h index f8f13801dd..7e444f2546 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o 
$(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index f6defe3d98..239e177d5b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -58,6 +58,41 @@ set auto_convert_model T save options auto_convert_model save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t +INFO: download model from http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz to the following directory: /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/models  +--2025-12-11 12:38:40-- http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz +Resolving feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)... 130.104.48.109 +Connecting to feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)|130.104.48.109|:80... connected. +HTTP request sent, awaiting response... 200 Ok +Length: 80562 (79K) [application/x-tar] +Saving to: ‘tmp.tgz’ + + 0K .......... .......... .......... .......... .......... 63% 880K 0s + 50K .......... .......... ........ 100% 1.37M=0.08s + +2025-12-11 12:38:40 (1018 KB/s) - ‘tmp.tgz’ saved [80562/80562] + +SMEFTsim_topU3l_MwScheme_UFO/ +SMEFTsim_topU3l_MwScheme_UFO/__init__.py +SMEFTsim_topU3l_MwScheme_UFO/param_card_massless.dat +SMEFTsim_topU3l_MwScheme_UFO/CT_couplings.py +SMEFTsim_topU3l_MwScheme_UFO/particles.py +SMEFTsim_topU3l_MwScheme_UFO/write_param_card.py +SMEFTsim_topU3l_MwScheme_UFO/decays.py +SMEFTsim_topU3l_MwScheme_UFO/parameters.py +SMEFTsim_topU3l_MwScheme_UFO/restrict_massless.dat +SMEFTsim_topU3l_MwScheme_UFO/object_library.py +SMEFTsim_topU3l_MwScheme_UFO/coupling_orders.py +SMEFTsim_topU3l_MwScheme_UFO/version.info +SMEFTsim_topU3l_MwScheme_UFO/function_library.py +SMEFTsim_topU3l_MwScheme_UFO/couplings.py +SMEFTsim_topU3l_MwScheme_UFO/propagators.py +SMEFTsim_topU3l_MwScheme_UFO/lorentz.py +SMEFTsim_topU3l_MwScheme_UFO/vertices.py +SMEFTsim_topU3l_MwScheme_UFO/restrict_SMlimit_massless.dat +fail to load model but auto_convert_model is on True. Trying to convert the model +convert model /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/models/SMEFTsim_topU3l_MwScheme_UFO +retry the load of the model +import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices CRITICAL: Model with non QCD emission of gluon (found 14 of those). 
@@ -73,7 +108,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.034352779388427734  +DEBUG: model prefixing takes 0.03392601013183594  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -82,27 +117,30 @@ Defined multiparticle l- = e- mu- Defined multiparticle vl = ve vm vt Defined multiparticle vl~ = ve~ vm~ vt~ Defined multiparticle all = g a ve vm vt ve~ vm~ vt~ u c t d s b t1 u~ c~ t~ d~ s~ b~ t1~ z w+ z1 w1+ h h1 w- w1- e- mu- ta- e+ mu+ ta+ +INFO: Change particles name to pass to MG5 convention +Kept definitions of multiparticles p / j / l+ / l- / vl / vl~ unchanged +Defined multiparticle all = g a ve vm vt ve~ vm~ vt~ u c t d s b t1 u~ c~ t~ d~ s~ b~ t1~ z w+ z1 w1+ h h1 w- w1- e- mu- ta- e+ mu+ ta+ generate g g > t t~ t t~ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 1.410 s +1 processes with 72 diagrams generated in 1.441 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt INFO: Organizing processes into subprocess groups INFO: 
Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h FileWriter for 
/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc @@ -114,7 +152,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.139 s +ALOHA: aloha creates 5 routines in 0.148 s VVV5 VVV5 FFV1 @@ -134,7 +172,7 @@ INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SME INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m3.134s -user 0m2.098s -sys 0m0.123s -Code generation completed in 3 seconds +real 0m4.179s +user 0m2.251s +sys 0m0.181s +Code generation completed in 4 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc index 6d552137f3..d7b2ade5f4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and 
denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -1822,9 +1819,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1900,8 +1896,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
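#endif /* clang-format on */

The select_col_and_diag kernel in the hunks below samples one diagram per event with probability proportional to its numerator, by inverting the cumulative distribution of the per-diagram numerators and skipping channels without an SDE config. A self-contained sketch of that sampling step, assuming the same conventions; the toy numerators and channel2iconfig values are invented:

#include <cassert>
#include <vector>

// Pick the first channel whose running numerator sum exceeds rnd * normalization,
// mirroring the allrnddiagram branch of select_col_and_diag (1-based channelId).
unsigned int sampleDiagram( const std::vector<double>& numerators,   // per-diagram |amp|^2, summed over helicities
                            const std::vector<int>& channel2iconfig, // -1 flags channels without an SDE iconfig
                            double rnd )                             // flat random number in [0,1)
{
  double normalization = 0.;
  for( size_t ichan = 0; ichan < numerators.size(); ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  double numerator_sum = 0.;
  for( size_t ichan = 0; ichan < numerators.size(); ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue; // no event-by-event choice for unmapped channels
    numerator_sum += numerators[ichan];
    if( rnd < numerator_sum / normalization ) return ichan + 1;
  }
  return numerators.size(); // fallback for rnd ~ 1 and rounding effects
}

int main()
{
  const std::vector<double> num = { 1., 3., 0., 6. }; // toy numerators (invented)
  const std::vector<int> c2i = { 1, 2, -1, 3 };       // toy channel->iconfig map (invented)
  assert( sampleDiagram( num, c2i, 0.05 ) == 1 );     // 0.05 < 1/10
  assert( sampleDiagram( num, c2i, 0.30 ) == 2 );     // 1/10 <= 0.30 < 4/10
  assert( sampleDiagram( num, c2i, 0.90 ) == 4 );     // 4/10 <= 0.90 < 10/10
  return 0;
}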
@@ -1963,25 +1958,37 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // input: if true, compute the final multichannel weights
+                    bool mulChannelWeight,             // input: if true, multiply the matrix element by the channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
 {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt]; // sum denominators over helicities into the "helicity #0" buffer
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag]; // sum per-diagram numerators over helicities
+        }
+      }
+      if( mulChannelWeight && allChannelIds != nullptr )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -2026,16 +2031,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids[nevt] (1 to #diagrams)
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: per-diagram numerators, summed over helicities
+                       const fptype* allDenominators,     // input: denominators, summed over helicities
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -2103,6 +2136,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -2112,6 +2146,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -2123,8 +2159,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -2167,7 +2205,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -2186,7 +2224,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -2198,6 +2238,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -2206,9 +2247,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -2219,9 +2261,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2259,9 +2307,6 @@ namespace mg5amcCpu const 
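
The weight applied by normalise_output in the mulChannelWeight branch is the ratio of the chosen channel's helicity-summed numerator to the event's helicity-summed denominator; as a standalone sketch (the helper name and plain double precision are illustrative, not part of the patch):

  // SDE channel weight for one event: numerators laid out as
  // numerators[ievt * ndiagrams + idiag], one denominator per event,
  // both already summed over the good helicities.
  inline double channelWeight( const double* numerators, const double* denominators,
                               unsigned int channelId, // 1-based diagram id
                               int ievt, int ndiagrams )
  {
    return numerators[ievt * ndiagrams + ( channelId - 1 )] / denominators[ievt];
  }

The ME is multiplied by this ratio, which is why the numerator super-buffer (and its memset above) now carries the extra factor of processConfig::ndiagrams.
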
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -2276,7 +2321,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -2324,82 +2370,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
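
Taken together, the new C++ block implements an inverse-CDF draw of one diagram per event plus a page-major SIMD layout for the per-diagram numerators; two standalone sketches follow (hypothetical helper names, scalar double for readability):

  // Sketch 1: inverse-CDF choice of a diagram from per-diagram numerators,
  // skipping channels with no SDE config (channel2iconfig[ichan] == -1).
  unsigned int sampleDiagram( const double* evtNumerators, // [ndiagrams] for one event
                              const int* channel2iconfig,  // -1 marks excluded channels
                              unsigned int nchannels,
                              double rnd ) // uniform random number in [0,1)
  {
    double norm = 0.;
    for( unsigned int i = 0; i < nchannels; i++ )
      if( channel2iconfig[i] != -1 ) norm += evtNumerators[i];
    double cumsum = 0.;
    for( unsigned int i = 0; i < nchannels; i++ )
    {
      if( channel2iconfig[i] == -1 ) continue;
      cumsum += evtNumerators[i];
      if( rnd < cumsum / norm ) return i + 1; // channelId is 1-based
    }
    return nchannels; // fallback when FP rounding leaves the draw unresolved (mirrors the code above)
  }

  // Sketch 2: the SIMD buffer index used above corresponds to a page-major
  // [ipagV][idiag][ieppV] layout of the per-diagram numerators.
  inline int numeratorIndex( int ievt, int idiag, int neppV, int ndiagrams )
  {
    const int ipagV = ievt / neppV; // SIMD page containing this event
    const int ieppV = ievt % neppV; // slot of this event within its page
    return ( ipagV * ndiagrams + idiag ) * neppV + ieppV;
    // == ievt / neppV * neppV * ndiagrams + idiag * neppV + ievt % neppV
  }
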
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -2424,13 +2485,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -2438,7 +2493,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h
index f8f13801dd..7e444f2546 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h
@@ -163,6 +163,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -171,6 +172,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -194,6 +198,8 @@ namespace mg5amcCpu
     int* allselcol,          // output: color selection[nevt]
     fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply the ME output by the channel weight
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o
$(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 30a2e623b3..cab1a5820a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -550,17 +550,17 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.079 s +1 processes with 6 diagrams generated in 0.054 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_t1t1 INFO: remove old information in CODEGEN_mad_susy_gg_t1t1 -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  @@ -580,17 +580,17 @@ INFO: Finding symmetric diagrams for subprocess group gg_t1t1x DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1588]  DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1589]  Generated helas calls for 1 subprocesses (6 diagrams) in 0.005 s -Wrote files for 16 helas calls in 1.460 s +Wrote files for 16 helas calls in 2.514 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.094 s +ALOHA: aloha creates 3 routines in 
0.105 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.095 s +ALOHA: aloha creates 6 routines in 0.089 s VVV1 VSS1 VSS1 @@ -611,17 +611,17 @@ INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  +DEBUG: result.returncode =  0 [output.py at line 275]  Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. Type "launch" to generate events from this process, or see /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README Run "open index.html" to see more information about this process. quit -real 0m8.992s -user 0m1.869s -sys 0m0.755s -Code generation completed in 9 seconds +real 0m11.941s +user 0m1.873s +sys 0m0.654s +Code generation completed in 12 seconds /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
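
For reference, the updated C++ host call above, reformatted with comments naming the new arguments (the values are exactly those in the hunk; this call keeps diagram sampling disabled and preserves the previous ME normalisation):

  sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(),
            pChannelIds,
            nullptr, // allrnddiagram (new): nullptr disables random diagram sampling
            m_matrixElements.data(), m_selhel.data(), m_selcol.data(),
            m_numerators.data(), m_denominators.data(),
            nullptr, // allDiagramIdsOut (new): no sampled-diagram output requested
            true,    // mulChannelWeight (new): apply the SDE channel weight to the ME
            nevt() );
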
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc index 2ddd4b8cc9..0b6ddbf4aa 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& 
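
In calculate_jamps each GPU thread now addresses a contiguous per-event block of processConfig::ndiagrams numerators instead of a single scalar running sum; the pointer arithmetic amounts to this (illustrative helper, not part of the patch):

  // Per-event slice of the numerators buffer, laid out as [nevt][ndiagrams]
  __host__ __device__ inline fptype* eventNumerators( fptype* allNumerators, int ievt, int ndiagrams )
  {
    return allNumerators + static_cast<size_t>( ievt ) * ndiagrams;
  }
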
denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -464,7 +461,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 VSS1_0( w_fp[4], w_fp[3], w_fp[2], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -481,7 +478,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -497,7 +494,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[3], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[3] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -513,7 +510,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VSS1_0( w_fp[1], w_fp[4], w_fp[2], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[4] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -529,7 +526,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 VSS1_0( w_fp[1], w_fp[2], w_fp[4], COUPs[3], -1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[5] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -834,9 +831,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -912,8 +908,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
 #endif /* clang-format on */
@@ -975,25 +970,35 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators,       // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights,          // if true, compute the final multichannel weights
+    bool mulChannelWeight,             // if true, multiply the matrix element by the channel weight (requires a non-null allChannelIds)
 #endif
     const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -1038,16 +1043,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: per-diagram numerators (the helicity #0 slice holds the totals over helicities)
+                       const fptype* allDenominators,     // input: denominators (the helicity #0 slice holds the totals over helicities)
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim ==
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1115,6 +1148,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1124,6 +1158,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1135,8 +1171,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1179,7 +1217,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + 
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1198,7 +1236,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1210,6 +1250,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1218,9 +1259,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1231,9 +1273,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1271,9 +1319,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1288,7 +1333,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1336,82 +1382,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1436,13 +1497,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1450,7 +1505,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h
index 0c297072b2..bee85ff0b7 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h
@@ -163,6 +163,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -171,6 +172,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -194,6 +198,8 @@ namespace mg5amcCpu
     int* allselcol,          // output: color selection[nevt]
     fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply the ME output by the channel weight
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o
$(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 9b4adc99fd..ca077e5af2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -550,31 +550,31 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.072 s +1 processes with 6 diagrams generated in 0.055 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 
'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. -Generated helas calls for 1 subprocesses (6 diagrams) in 0.005 s +Generated helas calls for 1 subprocesses (6 diagrams) in 0.010 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.088 s +ALOHA: aloha creates 3 routines in 0.102 s VVV1 VSS1 VSS1 @@ -590,7 +590,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
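// [Editor's note: illustrative sketch, not part of this patch.] The MatrixElementKernels.cc hunks
// below enlarge the numerator buffers from one slot per event to one slot per event and per diagram
// (nevt * CPPProcess::ndiagrams), so that every diagram's squared amplitude survives for the new
// diagram sampling. A minimal standalone picture of the assumed per-event layout; all names and
// sizes here are hypothetical:
#include <vector>
constexpr int ndiagrams = 6; // e.g. the 6 diagrams of g g > t1 t1~
constexpr int nevt = 16;     // any number of events
std::vector<double> numerators( nevt * ndiagrams, 0. ); // was: numerators( nevt )
inline double& numeratorSlot( std::vector<double>& buf, int ievt, int idiag )
{
  return buf[ievt * ndiagrams + idiag]; // diagram-major within each event, as in calculate_jamps
}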
quit -real 0m2.530s -user 0m0.769s -sys 0m0.160s -Code generation completed in 2 seconds +real 0m2.643s +user 0m0.729s +sys 0m0.132s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc index 6950507444..3db12d56be 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* 
numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -820,9 +817,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -898,8 +894,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -961,25 +956,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * 
processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1024,16 +1029,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1101,6 +1134,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1110,6 +1144,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and 
helicities
 fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
 fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1121,8 +1157,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 int* allselcol, // output: color selection[nevt]
- fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+ fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+ fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
 const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1165,7 +1203,7 @@ namespace mg5amcCpu
 gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
- gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
 gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
 gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1184,7 +1222,9 @@ namespace mg5amcCpu
 fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
 fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 for( int i = 0; i < processConfig::ndiagrams; ++i )
+ {
 numerators_sv[i] = fptype_sv{ 0 };
+ }
 denominators_sv = fptype_sv{ 0 };
 #endif
 }
@@ -1196,6 +1236,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
 // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+
 // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
 // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
 // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1204,9 +1245,10 @@
 const int ihel = cGoodHel[ighel];
 fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+ fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
 fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
- gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ gpuLaunchKernelStream( calculate_jamps,
gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1217,9 +1259,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1257,9 +1305,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1274,7 +1319,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1322,82 +1368,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1422,13 +1483,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1436,7 +1491,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h index 0c297072b2..bee85ff0b7 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* 
colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
 fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
 fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -194,6 +198,8 @@ namespace mg5amcCpu
 int* allselcol, // output: color selection[nevt]
 fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
 fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
 const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
#endif /* clang-format on */
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
###$(info processid_short=$(processid_short))
MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
ifneq ($(GPUCC),)
MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
endif
diff --git
a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index d22dd2464e..3ece1f2ceb 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -550,17 +550,17 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.093 s +1 processes with 3 diagrams generated in 0.077 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_tt INFO: remove old information in CODEGEN_mad_susy_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  @@ -579,16 +579,16 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1589]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s -Wrote files for 10 helas calls in 1.316 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Wrote files for 10 helas calls in 1.977 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.076 s +ALOHA: aloha creates 2 routines in 0.081 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.062 s +ALOHA: aloha creates 4 routines in 0.072 s VVV1 FFV1 FFV1 @@ -608,17 +608,17 @@ INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  +DEBUG: result.returncode =  0 [output.py at line 275]  Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. Type "launch" to generate events from this process, or see /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README Run "open index.html" to see more information about this process. 
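// [Editor's note: illustrative sketch, not part of this patch.] The new select_col_and_diag kernel
// and the C++ sigmaKin loop in this patch draw a channelId from the per-diagram numerators with an
// inverse-CDF scan, skipping channels that have no SDE config (channel2iconfig[ichan] == -1).
// A hedged standalone version of that logic; the function name and argument packing are
// illustrative only:
unsigned int sampleDiagram( const double* numerators,   // per-diagram numerators of one event [nchannels]
                            const int* channel2iconfig, // -1 marks channels without an SDE config
                            unsigned int nchannels,
                            double rnd )                // uniform random number in [0,1)
{
  double normalization = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  double cumulative = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue;
    cumulative += numerators[ichan];
    if( rnd < cumulative / normalization ) return ichan + 1; // channelIds are 1-based (Fortran convention)
  }
  return nchannels; // fallback, as in the patch, if rounding pushes rnd past the last bin
}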
quit -real 0m8.501s -user 0m1.813s -sys 0m0.727s -Code generation completed in 8 seconds +real 0m11.901s +user 0m1.762s +sys 0m0.693s +Code generation completed in 12 seconds /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index be603f5cda..14482e097b 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = 
DEN_ACCESS::kernelAccess( denominators ); @@ -455,7 +452,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], -1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[0] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -472,7 +469,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], -1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[1] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -488,7 +485,7 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], -1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId != 0 ) + if( storeChannelWeights ) { numerators_sv[2] += cxabs2( amp_sv[0] ); denominators_sv += cxabs2( amp_sv[0] ); @@ -805,9 +802,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -883,8 +879,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
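// [Editor's note: illustrative sketch, not part of this patch.] In the normalise_output rewrite
// just below, the per-diagram numerators of all good helicities are first accumulated into the
// "helicity #0" slice, and the single-diagram-enhancement weight then becomes, per event,
//   allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
// (channelId is 1-based), which reduces to the old totAllNumerators[ievt] / totAllDenominators[ievt]
// when processConfig::ndiagrams == 1.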
#endif /* clang-format on */ @@ -946,25 +941,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1009,16 +1014,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads)
 {
 const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
 // SCALAR channelId for the current event (CUDA)
 unsigned int channelId = gpu_channelId( allChannelIds );
 // Event-by-event random choice of color #402
+
+ // Event-by-event random choice of channel
+ if( allrnddiagram != nullptr )
+ {
+ fptype numerator_sum = 0., normalization = 0.;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+ }
+ channelId = mgOnGpu::nchannels;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+ if( allrnddiagram[ievt] < numerator_sum / normalization )
+ {
+ channelId = ichan + 1;
+ break;
+ }
+ }
+ allDiagramIdsOut[ievt] = channelId;
+ }
+
 if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
 {
 if( channelId > mgOnGpu::nchannels )
@@ -1086,6 +1119,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -1095,6 +1129,8 @@ namespace mg5amcCpu
 fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
 fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
 fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1106,8 +1142,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 int* allselcol, // output: color selection[nevt]
- fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+ fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+ fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
 const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1150,7 +1188,7 @@ namespace mg5amcCpu
 gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
- gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+
gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1169,7 +1207,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1181,6 +1221,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1189,9 +1230,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1202,9 +1244,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1242,9 +1290,6 @@ namespace mg5amcCpu const 
int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1259,7 +1304,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1307,82 +1353,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1407,13 +1468,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1421,7 +1476,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git 
a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 256c5780e4..99f978df4c 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o 
$(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 6416d0cc59..88fc5f557b 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -55,6 +55,9 @@ set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F import model MSSM_SLHA2 +INFO: load particles +INFO: load vertices +DEBUG: model prefixing takes 0.23334097862243652  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -550,30 +553,30 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.070 s +1 processes with 3 diagrams generated in 0.074 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 
'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. -Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.008 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.070 s +ALOHA: aloha creates 2 routines in 0.068 s VVV1 FFV1 FFV1 @@ -588,7 +591,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
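As an aside before the GpuAbstraction.h hunk further below: that hunk adds stream-ordered allocation macros (gpuMallocAsync, gpuFreeAsync, gpuStreamSynchronize) over cudaMallocAsync and hipMallocAsync. A minimal usage sketch follows, assuming the CUDA backend and the checkGpu wrapper already provided by that header; the function allocateScratch and the buffer scratch are illustrative names only, not part of the patch:

#include "GpuAbstraction.h"
// Sketch only: allocate, use and release a per-stream scratch buffer.
// On the CUDA backend the macros expand to checkGpu( cudaMallocAsync( ... ) ) etc.
inline void allocateScratch( gpuStream_t stream, int nevt )
{
  double* scratch = nullptr;
  gpuMallocAsync( (void**)&scratch, nevt * sizeof( double ), stream ); // stream-ordered allocation
  // ... enqueue kernels that use 'scratch' on the same stream ...
  gpuFreeAsync( scratch, stream );  // the free is ordered after the enqueued kernels
  gpuStreamSynchronize( stream );   // block the host until the stream has drained
}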
quit -real 0m2.256s -user 0m0.807s -sys 0m0.146s -Code generation completed in 2 seconds +real 0m3.608s +user 0m1.150s +sys 0m0.169s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index 4790c980b3..63f1df1073 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -303,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -312,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -408,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -430,10 +431,6 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = 
NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -793,9 +790,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -871,8 +867,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -934,25 +929,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply the matrix element by the channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt *
processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -997,16 +1002,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1074,6 +1107,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1083,6 +1117,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and 
helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1094,8 +1130,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1138,7 +1176,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1157,7 +1195,9 @@ namespace mg5amcCpu fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); for( int i = 0; i < processConfig::ndiagrams; ++i ) + { numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1169,6 +1209,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1177,9 +1218,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, 
gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1190,9 +1232,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1230,9 +1278,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - unsigned int channelId = getChannelId( allChannelIds, ievt00 ); #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1247,7 +1292,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1295,82 +1341,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1395,13 +1456,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1409,7 +1464,7 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h index 256c5780e4..99f978df4c 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, 
// tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif From 35abb17bd92bdbc950228534510f12f378d8df04 Mon Sep 17 00:00:00 2001 From: 
Stefan Roiser Date: Thu, 11 Dec 2025 16:48:26 +0100 Subject: [PATCH 18/18] regenerate all processes --- .../SubProcesses/P1_epem_mupmum/umami.cc | 1 + .../SubProcesses/P1_epem_mupmum/umami.h | 1 + .../cudacpp/ee_mumu.mad/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../cudacpp/ee_mumu.mad/SubProcesses/umami.h | 212 +++++++ .../P1_Sigma_sm_epem_mupmum/umami.cc | 1 + .../P1_Sigma_sm_epem_mupmum/umami.h | 1 + .../cudacpp/ee_mumu.sa/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../cudacpp/ee_mumu.sa/SubProcesses/umami.h | 212 +++++++ .../gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc | 1 + .../gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h | 1 + .../cudacpp/gg_tt.mad/SubProcesses/umami.cc | 530 ++++++++++++++++++ epochX/cudacpp/gg_tt.mad/SubProcesses/umami.h | 212 +++++++ .../SubProcesses/P1_Sigma_sm_gg_ttx/umami.cc | 1 + .../SubProcesses/P1_Sigma_sm_gg_ttx/umami.h | 1 + epochX/cudacpp/gg_tt.sa/SubProcesses/umami.cc | 530 ++++++++++++++++++ epochX/cudacpp/gg_tt.sa/SubProcesses/umami.h | 212 +++++++ .../SubProcesses/P1_gg_ttx/umami.cc | 1 + .../SubProcesses/P1_gg_ttx/umami.h | 1 + .../SubProcesses/P2_gg_ttxg/umami.cc | 1 + .../SubProcesses/P2_gg_ttxg/umami.h | 1 + .../gg_tt01g.mad/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../cudacpp/gg_tt01g.mad/SubProcesses/umami.h | 212 +++++++ .../SubProcesses/P1_gg_ttxg/umami.cc | 1 + .../SubProcesses/P1_gg_ttxg/umami.h | 1 + .../cudacpp/gg_ttg.mad/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../cudacpp/gg_ttg.mad/SubProcesses/umami.h | 212 +++++++ .../SubProcesses/P1_Sigma_sm_gg_ttxg/umami.cc | 1 + .../SubProcesses/P1_Sigma_sm_gg_ttxg/umami.h | 1 + .../cudacpp/gg_ttg.sa/SubProcesses/umami.cc | 530 ++++++++++++++++++ epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.h | 212 +++++++ .../SubProcesses/P1_gg_ttxgg/umami.cc | 1 + .../SubProcesses/P1_gg_ttxgg/umami.h | 1 + .../cudacpp/gg_ttgg.mad/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../cudacpp/gg_ttgg.mad/SubProcesses/umami.h | 212 +++++++ .../P1_Sigma_sm_gg_ttxgg/umami.cc | 1 + .../SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.h | 1 + .../cudacpp/gg_ttgg.sa/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../cudacpp/gg_ttgg.sa/SubProcesses/umami.h | 212 +++++++ .../SubProcesses/P1_gg_ttxggg/umami.cc | 1 + .../SubProcesses/P1_gg_ttxggg/umami.h | 1 + .../gg_ttggg.mad/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../cudacpp/gg_ttggg.mad/SubProcesses/umami.h | 212 +++++++ .../P1_Sigma_sm_gg_ttxggg/umami.cc | 1 + .../P1_Sigma_sm_gg_ttxggg/umami.h | 1 + .../cudacpp/gg_ttggg.sa/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../cudacpp/gg_ttggg.sa/SubProcesses/umami.h | 212 +++++++ .../SubProcesses/P1_gu_ttxu/umami.cc | 1 + .../SubProcesses/P1_gu_ttxu/umami.h | 1 + .../SubProcesses/P1_gux_ttxux/umami.cc | 1 + .../SubProcesses/P1_gux_ttxux/umami.h | 1 + .../cudacpp/gq_ttq.mad/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../cudacpp/gq_ttq.mad/SubProcesses/umami.h | 212 +++++++ .../SubProcesses/P1_Sigma_sm_gu_ttxu/umami.cc | 1 + .../SubProcesses/P1_Sigma_sm_gu_ttxu/umami.h | 1 + .../P1_Sigma_sm_gux_ttxux/umami.cc | 1 + .../P1_Sigma_sm_gux_ttxux/umami.h | 1 + .../cudacpp/gq_ttq.sa/SubProcesses/umami.cc | 530 ++++++++++++++++++ epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.h | 212 +++++++ .../SubProcesses/P1_gg_bbx/umami.cc | 1 + .../SubProcesses/P1_gg_bbx/umami.h | 1 + .../heft_gg_bb.mad/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../heft_gg_bb.mad/SubProcesses/umami.h | 212 +++++++ .../P1_Sigma_heft_gg_bbx/umami.cc | 1 + .../SubProcesses/P1_Sigma_heft_gg_bbx/umami.h | 1 + .../heft_gg_bb.sa/SubProcesses/umami.cc | 
530 ++++++++++++++++++ .../heft_gg_bb.sa/SubProcesses/umami.h | 212 +++++++ .../SubProcesses/P0_dux_ttxwm/umami.cc | 1 + .../SubProcesses/P0_dux_ttxwm/umami.h | 1 + .../SubProcesses/P0_udx_ttxwp/umami.cc | 1 + .../SubProcesses/P0_udx_ttxwp/umami.h | 1 + .../SubProcesses/P1_dux_ttxwmg/umami.cc | 1 + .../SubProcesses/P1_dux_ttxwmg/umami.h | 1 + .../SubProcesses/P1_gd_ttxwmu/umami.cc | 1 + .../SubProcesses/P1_gd_ttxwmu/umami.h | 1 + .../SubProcesses/P1_gdx_ttxwpux/umami.cc | 1 + .../SubProcesses/P1_gdx_ttxwpux/umami.h | 1 + .../SubProcesses/P1_gu_ttxwpd/umami.cc | 1 + .../SubProcesses/P1_gu_ttxwpd/umami.h | 1 + .../SubProcesses/P1_gux_ttxwmdx/umami.cc | 1 + .../SubProcesses/P1_gux_ttxwmdx/umami.h | 1 + .../SubProcesses/P1_udx_ttxwpg/umami.cc | 1 + .../SubProcesses/P1_udx_ttxwpg/umami.h | 1 + .../nobm_pp_ttW.mad/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../nobm_pp_ttW.mad/SubProcesses/umami.h | 212 +++++++ .../SubProcesses/P0_gg_ttx/umami.cc | 1 + .../SubProcesses/P0_gg_ttx/umami.h | 1 + .../SubProcesses/P0_uux_ttx/umami.cc | 1 + .../SubProcesses/P0_uux_ttx/umami.h | 1 + .../SubProcesses/P1_gg_ttxg/umami.cc | 1 + .../SubProcesses/P1_gg_ttxg/umami.h | 1 + .../SubProcesses/P1_gu_ttxu/umami.cc | 1 + .../SubProcesses/P1_gu_ttxu/umami.h | 1 + .../SubProcesses/P1_gux_ttxux/umami.cc | 1 + .../SubProcesses/P1_gux_ttxux/umami.h | 1 + .../SubProcesses/P1_uux_ttxg/umami.cc | 1 + .../SubProcesses/P1_uux_ttxg/umami.h | 1 + .../SubProcesses/P2_gg_ttxgg/umami.cc | 1 + .../SubProcesses/P2_gg_ttxgg/umami.h | 1 + .../SubProcesses/P2_gg_ttxuux/umami.cc | 1 + .../SubProcesses/P2_gg_ttxuux/umami.h | 1 + .../SubProcesses/P2_gu_ttxgu/umami.cc | 1 + .../SubProcesses/P2_gu_ttxgu/umami.h | 1 + .../SubProcesses/P2_gux_ttxgux/umami.cc | 1 + .../SubProcesses/P2_gux_ttxgux/umami.h | 1 + .../SubProcesses/P2_uc_ttxuc/umami.cc | 1 + .../SubProcesses/P2_uc_ttxuc/umami.h | 1 + .../SubProcesses/P2_ucx_ttxucx/umami.cc | 1 + .../SubProcesses/P2_ucx_ttxucx/umami.h | 1 + .../SubProcesses/P2_uu_ttxuu/umami.cc | 1 + .../SubProcesses/P2_uu_ttxuu/umami.h | 1 + .../SubProcesses/P2_uux_ttxccx/umami.cc | 1 + .../SubProcesses/P2_uux_ttxccx/umami.h | 1 + .../SubProcesses/P2_uux_ttxgg/umami.cc | 1 + .../SubProcesses/P2_uux_ttxgg/umami.h | 1 + .../SubProcesses/P2_uux_ttxuux/umami.cc | 1 + .../SubProcesses/P2_uux_ttxuux/umami.h | 1 + .../SubProcesses/P2_uxcx_ttxuxcx/umami.cc | 1 + .../SubProcesses/P2_uxcx_ttxuxcx/umami.h | 1 + .../SubProcesses/P2_uxux_ttxuxux/umami.cc | 1 + .../SubProcesses/P2_uxux_ttxuxux/umami.h | 1 + .../pp_tt012j.mad/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../pp_tt012j.mad/SubProcesses/umami.h | 212 +++++++ .../SubProcesses/P1_gg_ttxttx/umami.cc | 1 + .../SubProcesses/P1_gg_ttxttx/umami.h | 1 + .../smeft_gg_tttt.mad/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../smeft_gg_tttt.mad/SubProcesses/umami.h | 212 +++++++ .../umami.cc | 1 + .../umami.h | 1 + .../smeft_gg_tttt.sa/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../smeft_gg_tttt.sa/SubProcesses/umami.h | 212 +++++++ .../SubProcesses/P1_gg_t1t1x/umami.cc | 1 + .../SubProcesses/P1_gg_t1t1x/umami.h | 1 + .../susy_gg_t1t1.mad/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../susy_gg_t1t1.mad/SubProcesses/umami.h | 212 +++++++ .../P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.cc | 1 + .../P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.h | 1 + .../susy_gg_t1t1.sa/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../susy_gg_t1t1.sa/SubProcesses/umami.h | 212 +++++++ .../SubProcesses/P1_gg_ttx/umami.cc | 1 + .../SubProcesses/P1_gg_ttx/umami.h | 1 + 
.../susy_gg_tt.mad/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../susy_gg_tt.mad/SubProcesses/umami.h | 212 +++++++ .../P1_Sigma_MSSM_SLHA2_gg_ttx/umami.cc | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/umami.h | 1 + .../susy_gg_tt.sa/SubProcesses/umami.cc | 530 ++++++++++++++++++ .../susy_gg_tt.sa/SubProcesses/umami.h | 212 +++++++ 146 files changed, 17166 insertions(+) create mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.cc create mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.h create mode 100644 epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.h create mode 120000 epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.cc create mode 120000 epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.h create mode 100644 epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.h create mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc create mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h create mode 100644 epochX/cudacpp/gg_tt.mad/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/gg_tt.mad/SubProcesses/umami.h create mode 120000 epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.cc create mode 120000 epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.h create mode 100644 epochX/cudacpp/gg_tt.sa/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/gg_tt.sa/SubProcesses/umami.h create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.cc create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.h create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.cc create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.h create mode 100644 epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.h create mode 120000 epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.cc create mode 120000 epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.h create mode 100644 epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.h create mode 120000 epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.cc create mode 120000 epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.h create mode 100644 epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.h create mode 120000 epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.cc create mode 120000 epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.h create mode 100644 epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.h create mode 120000 epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.cc create mode 120000 epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.h create mode 100644 epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.h create mode 120000 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.cc create mode 120000 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.h create mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.h create mode 120000 
epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.cc create mode 120000 epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.h create mode 100644 epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.h create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.cc create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.h create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.cc create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.h create mode 100644 epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.h create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.cc create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.h create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.cc create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.h create mode 100644 epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.h create mode 120000 epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.cc create mode 120000 epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.h create mode 100644 epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.h create mode 120000 epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.cc create mode 120000 epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.h create mode 100644 epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.h create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.cc create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.h create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.cc create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.h create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.cc create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.h create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.cc create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.h create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.cc create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.h create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.cc create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.h create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.cc create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.h create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.cc create mode 120000 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.h create mode 100644 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.h create mode 120000 
epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.cc create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.h create mode 120000 epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.cc create mode 120000 epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.h create mode 100644 epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/umami.h create mode 120000 epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.cc create mode 120000 epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.h create mode 100644 
epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.h create mode 120000 epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.cc create mode 120000 epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.h create mode 100644 epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.h create mode 120000 epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.cc create mode 120000 epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.h create mode 100644 epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.h create mode 120000 epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc create mode 120000 epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h create mode 100644 epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.cc create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.h diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + 
count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: 
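+ // keys added in newer minor versions of the interface (and, for now, UMAMI_META_COLOR_COUNT above) all map to the same unsupported-meta error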
+ return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, 
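+ // per-event selection indices filled by sigmaKin (1-based, Fortran convention); copy_outputs converts them to 0-based indices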
*color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round the event count up to twice the SIMD page size (probably because mixed-precision SIMD builds process two neppV event pages at a time) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index( rounded_count ); + HostBufferBase color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++
b/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. + */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries global metadata of the generated matrix element code, such as the + * target device and the number of particles, diagrams and helicity combinations. + * + * @param meta_key + * key identifying the metadata entry to query + * @param result + * pointer to memory receiving the value; its type depends on the key + * (UmamiDevice for UMAMI_META_DEVICE, int for the count keys) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real-valued parameters.
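+ * + * (The generated cudacpp implementation currently returns UMAMI_ERROR_NOT_IMPLEMENTED here; parameters are instead fixed by the param card passed to umami_initialize.)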
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. + * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. 
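+ * + * A minimal call might pass a single input and output (sketch only; `momenta` and `m2` are caller-allocated arrays, error handling omitted): + * UmamiInputKey ikeys[] = { UMAMI_IN_MOMENTA }; + * const void* ins[] = { momenta }; + * UmamiOutputKey okeys[] = { UMAMI_OUT_MATRIX_ELEMENT }; + * void* outs[] = { m2 }; + * umami_matrix_element( handle, n, n, 0, 1, ikeys, ins, 1, okeys, outs );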
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* 
color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return 
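+ // parameter introspection, like umami_set_parameter above, is not yet supported by the generated code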
UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int 
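+ // one channelId slot per event, filled by sigmaKin's multichannel diagram sampling and remapped to a 0-based diagram index in copy_outputs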
), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round the event count up to twice the SIMD page size (probably because mixed-precision SIMD builds process two neppV event pages at a time) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index(
rounded_count ); + HostBufferBase color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
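+ * Callers should treat unknown keys gracefully and rely on the UMAMI_ERROR_UNSUPPORTED_* return codes rather than assuming a fixed key set.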
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves a piece of static metadata about the compiled matrix element, such as
+   * the device it runs on or its particle, diagram and helicity counts.
+   *
+   * @param meta_key
+   *    key identifying the metadata item to query
+   * @param result
+   *    pointer to caller-allocated storage receiving the value; its type depends on
+   *    the key (UmamiDevice for UMAMI_META_DEVICE, int otherwise)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @param param_card_path
+   *    path to the parameter file
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return the real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return the imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
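+   *
+   * Minimal illustrative call sequence (hypothetical caller code; error handling
+   * omitted; `momenta` and `m2` are caller-owned double arrays for a contiguous
+   * batch of `n` events, i.e. stride == n and offset == 0):
+   *
+   *   UmamiHandle h;
+   *   umami_initialize( &h, "param_card.dat" );
+   *   UmamiInputKey ikeys[] = { UMAMI_IN_MOMENTA };
+   *   const void* ins[] = { momenta };
+   *   UmamiOutputKey okeys[] = { UMAMI_OUT_MATRIX_ELEMENT };
+   *   void* outs[] = { m2 };
+   *   umami_matrix_element( h, n, n, 0, 1, ikeys, ins, 1, okeys, outs );
+   *   umami_free( h );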
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset of the event index
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs to the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.cc
new file mode 100644
index 0000000000..2b52267519
--- /dev/null
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.cc
@@ -0,0 +1,530 @@
+#include "umami.h"
+
+#include "CPPProcess.h"
+#include "GpuRuntime.h"
+#include "MemoryAccessMomenta.h"
+#include "MemoryBuffers.h"
+
+#include <cmath>
+
+#ifdef MGONGPUCPP_GPUIMPL
+using namespace mg5amcGpu;
+#else
+using namespace mg5amcCpu;
+#endif
+
+namespace
+{
+
+  void* initialize_impl(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    bool is_good_hel[CPPProcess::ncomb];
+    sigmaKin_getGoodHel(
+      momenta, couplings, matrix_elements, numerators, denominators,
+#ifdef MGONGPUCPP_GPUIMPL
+      color_jamps,
+#endif
+      is_good_hel,
+      count );
+    sigmaKin_setGoodHel( is_good_hel );
+    return nullptr;
+  }
+
+  void initialize(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    // static local initialization is called exactly once in a thread-safe way
+    static void* dummy = initialize_impl( momenta, couplings, matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+                                          color_jamps,
+#endif
+                                          numerators,
+
denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return 
UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, 
*ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else 
// MGONGPUCPP_GPUIMPL + // need to round to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index( rounded_count ); + HostBufferBase color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.h new file mode 100644 index 
0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. + */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param meta_key + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. 
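+   *    For example (illustrative only; parameter naming depends on the model):
+   *    umami_set_parameter( handle, "mdl_MT", 173.0, 0.0 );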
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. + * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. 
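+   *    As implemented in this file, per-event outputs are indexed as
+   *    out[i_event + offset] and per-diagram outputs as
+   *    out[stride * i_diag + i_event + offset].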
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + 
const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( 
+ UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * 
CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index( rounded_count ); + HostBufferBase color_index( rounded_count ); + 
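// Fill the SIMD-friendly host buffers event by event; optional inputs that were
+    // not provided fall back to defaults (0.5 for the random numbers, a fixed g_s).
+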
for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param meta_key + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. 
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + 
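// Probe all helicity combinations once, so that later sigmaKin calls can
+    // skip the vanishing ones.
+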
sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
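+        // Map each caller-provided key to its typed input pointer; unknown or
+        // unsupported keys abort the call with an error status.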
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index( rounded_count ); + HostBufferBase color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
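+   *
+   * A caller that loads an implementation dynamically can therefore check
+   * compatibility up front. A minimal sketch (how the caller obtains the
+   * implementation's version, e.g. via dlsym, is left open; impl_major is
+   * hypothetical):
+   *
+   *   if( impl_major != UMAMI_MAJOR_VERSION )
+   *     return UMAMI_ERROR; // incompatible binary interface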
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves a piece of global metadata about this matrix element implementation.
+   * No instance handle is required, so this can be called before umami_initialize.
+   *
+   * @param meta_key
+   *    key selecting which piece of metadata to query
+   * @param result
+   *    pointer to caller-allocated storage receiving the result; its type depends
+   *    on the key (UmamiDevice for UMAMI_META_DEVICE, int for the count keys)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
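+   *
+   * Memory layout (as implemented by umami.cc in this patch): per-event arrays are
+   * batched, with the event index running fastest. The momenta input for component
+   * i_mom of particle i_part in event i_event is read from
+   *
+   *   momenta[offset + stride * ( npar * i_mom + i_part ) + i_event]
+   *
+   * while e.g. the per-diagram output is written to
+   *
+   *   amp2[offset + stride * i_diag + i_event]
+   *
+   * and scalar per-event outputs to out[offset + i_event].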
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + 
numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return 
UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, 
*ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else 
// MGONGPUCPP_GPUIMPL
+    // round the event count up to twice the SIMD page size, as required by the vectorized sigmaKin
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves a piece of global metadata about this matrix element implementation.
+   * No instance handle is required, so this can be called before umami_initialize.
+   *
+   * @param meta_key
+   *    key selecting which piece of metadata to query
+   * @param result
+   *    pointer to caller-allocated storage receiving the result; its type depends
+   *    on the key (UmamiDevice for UMAMI_META_DEVICE, int for the count keys)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
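+   *
+   * Note: in this patch the function is a stub that returns
+   * UMAMI_ERROR_NOT_IMPLEMENTED. Once implemented, a call could look like the
+   * following sketch (parameter names are model-dependent; "MT" is an assumption):
+   *
+   *   umami_set_parameter( handle, "MT", 173.0, 0.0 ); // real-valued top mass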
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. + * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. 
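+   *
+   * Minimal usage sketch (hypothetical caller; n_events, momenta and me are not
+   * part of this patch, and momenta must follow the batched layout implemented in
+   * umami.cc):
+   *
+   *   UmamiInputKey ikeys[1] = { UMAMI_IN_MOMENTA };
+   *   const void* ins[1] = { momenta };                  // double[4 * npar * n_events]
+   *   UmamiOutputKey okeys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
+   *   void* outs[1] = { me };                            // double[n_events]
+   *   umami_matrix_element( handle, n_events, n_events, 0, 1, ikeys, ins, 1, okeys, outs );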
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* 
diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus 
umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, 
rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index( rounded_count ); + HostBufferBase color_index( 
rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
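+   *
+   * Unknown keys are reported through the UMAMI_ERROR_UNSUPPORTED_* codes (see the
+   * switch defaults in umami.cc), so a caller built against a newer minor version
+   * can probe optional features; a sketch:
+   *
+   *   int ncolor = 0;
+   *   if( umami_get_meta( UMAMI_META_COLOR_COUNT, &ncolor ) != UMAMI_SUCCESS )
+   *     ncolor = -1; // color metadata not provided by this implementation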
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves a piece of global metadata about this matrix element implementation.
+   * No instance handle is required, so this can be called before umami_initialize.
+   *
+   * @param meta_key
+   *    key selecting which piece of metadata to query
+   * @param result
+   *    pointer to caller-allocated storage receiving the result; its type depends
+   *    on the key (UmamiDevice for UMAMI_META_DEVICE, int for the count keys)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
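+   *
+   * Optional inputs have defaults in this implementation: the random numbers for
+   * helicity, color and diagram selection fall back to 0.5 when not supplied, and
+   * if UMAMI_IN_ALPHA_S is absent the strong coupling g_s falls back to the fixed
+   * value 1.2177157847767195 (i.e. alpha_s = 0.118), as in copy_inputs in umami.cc.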
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, 
+#endif
+                                          numerators,
+                                          denominators,
+                                          count );
+  }
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__
+#endif
+    void
+    transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride )
+  {
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    std::size_t i_page = i_event / page_size;
+    std::size_t i_vector = i_event % page_size;
+
+    for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part )
+    {
+      for( std::size_t i_mom = 0; i_mom < 4; ++i_mom )
+      {
+        momenta_out[i_page * CPPProcess::npar * 4 * page_size +
+                    i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event];
+      }
+    }
+  }
+
+#ifdef MGONGPUCPP_GPUIMPL
+
+  __global__ void copy_inputs(
+    const double* momenta_in,
+    const double* helicity_random_in,
+    const double* color_random_in,
+    const double* diagram_random_in,
+    const double* alpha_s_in,
+    fptype* momenta,
+    fptype* helicity_random,
+    fptype* color_random,
+    fptype* diagram_random,
+    fptype* g_s,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    transpose_momenta( &momenta_in[offset], momenta, i_event, stride );
+    diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5;
+    helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5;
+    color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5;
+    g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    // convert the 1-based indices chosen internally to the 0-based indices of the interface
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators,
+      *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    // NB: the launch configurations below were garbled in formatting; a grid of
+    // n_blocks x n_threads on the caller-provided gpu_stream is assumed
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // round the event count up to TWICE the SIMD momenta page size
+    // (two neppV event pages can be processed together, e.g. in mixed floating-point precision mode)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                         _
+ *                        (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h> // assumed: the bracketed header name was lost in formatting; <stddef.h> provides size_t for the C API
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Queries global metadata of the matrix element implementation.
+   *
+   * @param meta_key
+   *    key of the metadata entry to query
+   * @param result
+   *    pointer to caller-allocated storage for the value; its type depends on the
+   *    key (UmamiDevice for UMAMI_META_DEVICE, int for the count keys)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
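+   *    (Illustrative only: once implemented, a call such as
+   *    umami_set_parameter( h, "MT", 173.0, 0.0 ) would set a real-valued
+   *    parameter; the name "MT" is a hypothetical example. This cudacpp
+   *    backend currently returns UMAMI_ERROR_NOT_IMPLEMENTED from this
+   *    entry point.)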
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. + * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. 
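+ *            (Memory-layout note, mirroring the indexing in transpose_momenta in
+ *            umami.cc: for UMAMI_IN_MOMENTA the expected shape is
+ *            [4][n_particles][stride] doubles, i.e. component i_mom of particle
+ *            i_part for event i_event is read from index
+ *            offset + stride * ( n_particles * i_mom + i_part ) + i_event.)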
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + 
const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + 
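+  // -------------------------------------------------------------------------
+  // Illustrative caller-side sketch (comment only, not compiled here): one way
+  // a host application might drive this C API. Variable names, the fixed
+  // count/stride, and the use of std::vector are assumptions of the example,
+  // not part of the interface. Inputs that are omitted (random numbers,
+  // alpha_s) fall back to the defaults applied in copy_inputs above.
+  //
+  //   UmamiHandle h = nullptr;
+  //   if( umami_initialize( &h, "param_card.dat" ) != UMAMI_SUCCESS ) return 1;
+  //   int npar = 0;
+  //   umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar );
+  //   const size_t count = 16, stride = 16, offset = 0;
+  //   std::vector<double> momenta( 4 * npar * stride ); // [i_mom][i_part][i_event]
+  //   std::vector<double> m2( count );                  // one ME value per event
+  //   UmamiInputKey ikeys[] = { UMAMI_IN_MOMENTA };
+  //   const void* ins[] = { momenta.data() };
+  //   UmamiOutputKey okeys[] = { UMAMI_OUT_MATRIX_ELEMENT };
+  //   void* outs[] = { m2.data() };
+  //   umami_matrix_element( h, count, stride, offset, 1, ikeys, ins, 1, okeys, outs );
+  //   umami_free( h );
+  // -------------------------------------------------------------------------
+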
UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( 
&color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index( rounded_count ); + HostBufferBase 
color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
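+ * A caller can therefore gate binary compatibility on the major version alone,
+ * e.g. refuse to load the library unless UMAMI_MAJOR_VERSION == 1 (an
+ * illustrative policy, not mandated by this header).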
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param meta_key + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. 
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL 
+ color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case 
UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, 
*denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, 
gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index( rounded_count ); + HostBufferBase color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.h new file mode 100644 
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Queries a global property of the matrix element code, e.g. the device it
+   * runs on or the number of external particles, diagrams or helicities.
+   *
+   * @param meta_key
+   *    key identifying the property to query
+   * @param result
+   *    pointer to caller-allocated memory receiving the value; its type depends
+   *    on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
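+   *
+   *    Example (a sketch; the parameter name is hypothetical, and the cudacpp
+   *    implementation in umami.cc currently returns UMAMI_ERROR_NOT_IMPLEMENTED
+   *    from this entry point):
+   *
+   *      umami_set_parameter( handle, "mdl_MT", 173.0, 0.0 );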
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. + * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. 
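+   *
+   *    Memory layout, as implemented by transpose_momenta and copy_outputs in
+   *    umami.cc: per-event scalars are indexed as value[offset + i_event];
+   *    momenta as momenta[stride * ( npar * i_mom + i_part ) + offset + i_event];
+   *    per-diagram squared amplitudes as amp2[stride * i_diag + offset + i_event].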
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* 
color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return 
UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int 
), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index( 
rounded_count ); + HostBufferBase color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
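+   * Keys that an implementation does not recognize are reported through the
+   * UMAMI_ERROR_UNSUPPORTED_META, UMAMI_ERROR_UNSUPPORTED_INPUT and
+   * UMAMI_ERROR_UNSUPPORTED_OUTPUT codes, so a caller built against a newer
+   * minor version can probe for optional features at run time.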
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Queries a global property of the matrix element code, e.g. the device it
+   * runs on or the number of external particles, diagrams or helicities.
+   *
+   * @param meta_key
+   *    key identifying the property to query
+   * @param result
+   *    pointer to caller-allocated memory receiving the value; its type depends
+   *    on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
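+   *
+   * In this cudacpp implementation, optional inputs that are not supplied fall
+   * back to defaults: the random numbers for helicity, color and diagram choice
+   * default to 0.5, and alpha_s to a fixed reference value (g_s ~ 1.2177).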
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( 
+ momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index( rounded_count ); + HostBufferBase color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Queries a global property of the matrix element code, e.g. the device it
+   * runs on or the number of external particles, diagrams or helicities.
+   *
+   * @param meta_key
+   *    key identifying the property to query
+   * @param result
+   *    pointer to caller-allocated memory receiving the value; its type depends
+   *    on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
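+   *
+   * A minimal invocation with one input and one output might look as follows
+   * (a sketch; `handle`, `momenta`, `me2` and `n` are caller-side variables,
+   * with the momenta laid out as described under `stride`):
+   *
+   *   UmamiInputKey in_keys[1] = { UMAMI_IN_MOMENTA };
+   *   const void* inputs[1] = { momenta };
+   *   UmamiOutputKey out_keys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
+   *   void* outputs[1] = { me2 };
+   *   umami_matrix_element( handle, n, n, 0, 1, in_keys, inputs, 1, out_keys, outputs );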
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* 
denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
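To make the key/pointer plumbing above concrete, a hedged caller-side sketch of one umami_matrix_element invocation, passing momenta and alpha_s and requesting only the matrix element; momenta_array, alpha_s_array, m2, handle, count and stride are assumed caller-owned variables, not part of the patch.

    UmamiInputKey in_keys[2] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
    const void* ins[2] = { momenta_array, alpha_s_array }; // double arrays with batch stride `stride`
    UmamiOutputKey out_keys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
    void* outs[1] = { m2 };                                // double[count], filled by the call
    UmamiStatus st = umami_matrix_element( handle, count, stride, 0 /*offset*/,
                                           2, in_keys, ins, 1, out_keys, outs );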
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // round the event count up to two SIMD event pages: in "mixed" floating-point
+    // precision mode the C++ kernels process two neppV event pages at a time
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+    }
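For context, a small self-contained sketch of the AOSOA indexing used by transpose_momenta above: events are grouped into pages of MemoryAccessMomentaBase::neppM, and within a page the layout is [particle][component][event-in-page]. The function and parameter names below (aosoa_index, page_size, npar) are hypothetical stand-ins, not identifiers from the patch.

    #include <cassert>
    #include <cstddef>

    // Compute the flat AOSOA offset of component i_mom of particle i_part for event i_event.
    std::size_t aosoa_index( std::size_t i_event, std::size_t i_part, std::size_t i_mom,
                             std::size_t page_size, std::size_t npar )
    {
      const std::size_t i_page = i_event / page_size;   // which SIMD event page
      const std::size_t i_vector = i_event % page_size; // slot within the page
      return i_page * npar * 4 * page_size + i_part * 4 * page_size + i_mom * page_size + i_vector;
    }

    int main()
    {
      // event 5 with page_size 4 lands in page 1, slot 1
      assert( aosoa_index( 5, 2, 3, 4, 4 ) == 1 * 4 * 4 * 4 + 2 * 4 * 4 + 3 * 4 + 1 ); // == 109
      return 0;
    }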
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
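Putting the pieces together, a hedged end-to-end sketch of the intended call sequence (caller-side illustration, not part of the patch; the key/pointer arrays are assumed to be filled as in the earlier sketch, and "param_card.dat" is a placeholder path):

    UmamiHandle h = nullptr;
    if( umami_initialize( &h, "param_card.dat" ) != UMAMI_SUCCESS ) return 1;
    // ... build in_keys/ins and out_keys/outs as shown above ...
    umami_matrix_element( h, count, stride, 0 /*offset*/,
                          input_count, in_keys, ins, output_count, out_keys, outs );
    umami_free( h ); // also destroys the per-helicity streams on GPU builds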
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Queries a piece of global metadata about the matrix element code, such as the
+   * target device or the number of particles, diagrams and helicity combinations.
+   *
+   * @param meta_key
+   *    key identifying the metadata item to query
+   * @param result
+   *    pointer to caller-owned storage that receives the value; its type depends on
+   *    the key (e.g. UmamiDevice for UMAMI_META_DEVICE, int for the various counts)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @param param_card_path
+   *    path to the parameter file
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL 
+ color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case 
UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, 
*denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, 
gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index( rounded_count ); + HostBufferBase color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.h new file mode 
100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. + */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param meta_key + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. 
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. + * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. 
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const 
double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return 
UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int 
), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index( 
rounded_count ); + HostBufferBase color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param meta_key + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. 
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- 
/dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // round the event count up to twice the SIMD page size (in mixed precision the
+    // sigmaKin implementation processes two neppV event pages at a time)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ?
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
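For callers preparing UMAMI_IN_MOMENTA: the transpose loop above reads momenta_in[stride * ( npar * i_mom + i_part ) + i_event], i.e. the event index is the fastest-running dimension. A small sketch of a caller-side setter under that assumption (helper name hypothetical):

  #include <cstddef>
  #include <vector>

  // Hypothetical helper: write four-momentum component i_mom (E, px, py, pz)
  // of particle i_part for event i_event into the caller-side buffer expected
  // by UMAMI_IN_MOMENTA, matching the index used by transpose_momenta above.
  void set_momentum( std::vector<double>& momenta, std::size_t stride, std::size_t npar,
                     std::size_t i_event, std::size_t i_part, std::size_t i_mom, double value )
  {
    momenta[stride * ( npar * i_mom + i_part ) + i_event] = value;
  }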
+ */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves a metadata value of the matrix element implementation, e.g. the device
+   * it runs on or its particle, diagram and helicity counts.
+   *
+   * @param meta_key
+   *    key of the metadata entry to query
+   * @param result
+   *    pointer to which the queried value is written; its type depends on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return the real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return the imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
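Before the parameter list of umami_matrix_element continues below, one practical note: the implementation writes the per-diagram output as amp2_out[stride * i_diag + i_event + offset], so the UMAMI_OUT_DIAGRAM_AMP2 buffer should be sized using the diagram count from umami_get_meta. A hedged sketch (helper name hypothetical):

  #include "umami.h"
  #include <cstddef>
  #include <vector>

  // Hypothetical caller-side allocation of the UMAMI_OUT_DIAGRAM_AMP2 buffer:
  // n_diagrams blocks of `stride` doubles each, diagram index slowest.
  std::vector<double> make_amp2_buffer( std::size_t stride )
  {
    int n_diagrams = 0;
    if( umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &n_diagrams ) != UMAMI_SUCCESS )
      return {};
    return std::vector<double>( static_cast<std::size_t>( n_diagrams ) * stride );
  }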
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.cc 
new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.h 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of 
file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, 
denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // round the event count up to twice the SIMD page size (in mixed precision the
+    // sigmaKin implementation processes two neppV event pages at a time)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ?
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
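Since every instance created by umami_initialize must eventually be released with umami_free (as implemented above), C++ callers may prefer a small RAII wrapper; a sketch, not part of the interface itself:

  #include "umami.h"

  // Hypothetical RAII wrapper: ties the lifetime of a matrix element instance
  // to a C++ scope, so umami_free is called even on early returns.
  class UmamiInstance
  {
  public:
    explicit UmamiInstance( const char* param_card_path )
    {
      if( umami_initialize( &m_handle, param_card_path ) != UMAMI_SUCCESS )
        m_handle = nullptr;
    }
    ~UmamiInstance()
    {
      if( m_handle ) umami_free( m_handle );
    }
    UmamiInstance( const UmamiInstance& ) = delete;            // no copies:
    UmamiInstance& operator=( const UmamiInstance& ) = delete; // one owner per handle
    UmamiHandle get() const { return m_handle; }
  private:
    UmamiHandle m_handle = nullptr;
  };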
+ */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves a metadata value of the matrix element implementation, e.g. the device
+   * it runs on or its particle, diagram and helicity counts.
+   *
+   * @param meta_key
+   *    key of the metadata entry to query
+   * @param result
+   *    pointer to which the queried value is written; its type depends on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return the real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return the imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
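A note on the UmamiStatus codes listed above: they distinguish hard failures from merely unsupported keys, so a caller can degrade gracefully. One possible (hypothetical) policy:

  #include "umami.h"

  // Hypothetical status check: treat an unsupported optional output as
  // non-fatal, everything else as a real failure.
  bool check_status( UmamiStatus status )
  {
    switch( status )
    {
      case UMAMI_SUCCESS: return true;
      case UMAMI_ERROR_UNSUPPORTED_OUTPUT: return true; // skip optional output
      default: return false;                            // real failure
    }
  }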
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/umami.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, 
couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) 
= CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + 
fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream 
);
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // round the event count up to a multiple of TWO SIMD event pages
+    // (the C++ sigmaKin appears to process two neppV pages per call, e.g. in mixed precision)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default corresponds to alpha_s = 0.118
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/umami.h
b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves one piece of global metadata about this matrix element code, e.g. the
+   * device it targets or the number of particles, diagrams or helicity combinations.
+   *
+   * @param meta_key
+   *    key selecting the metadatum to retrieve
+   * @param result
+   *    pointer to caller-allocated memory that receives the value; its type
+   *    depends on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
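+   *
+   * Illustrative call (valid parameter names depend on the model, and this
+   * backend may report UMAMI_ERROR_NOT_IMPLEMENTED):
+   *
+   *   umami_set_parameter( handle, "MT", 172.5, 0.0 );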
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. + * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. 
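+   *
+   * Layout note (as implemented in umami.cc): UMAMI_OUT_DIAGRAM_AMP2 is filled as
+   * amp2[stride * i_diag + offset + i_event], i.e. one stride-sized block of
+   * per-event values for each Feynman diagram.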
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( 
CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + 
return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * 
sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // round the event count up to a multiple of TWO SIMD event pages
+    // (the C++ sigmaKin appears to process two neppV pages per call, e.g. in mixed precision)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int>
diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index( rounded_count ); + HostBufferBase color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. 
+ */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves one piece of global metadata about this matrix element code, e.g. the
+   * device it targets or the number of particles, diagrams or helicity combinations.
+   *
+   * @param meta_key
+   *    key selecting the metadatum to retrieve
+   * @param result
+   *    pointer to caller-allocated memory that receives the value; its type
+   *    depends on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
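+   *
+   * A minimal call sketch (illustrative only; `momenta`, `alpha_s`, `m2` and `n` are
+   * placeholder names, not part of the interface). For n contiguously stored events
+   * (stride == n, offset == 0), requesting only the squared matrix element:
+   *
+   *   UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
+   *   const void* in_ptrs[] = { momenta, alpha_s };
+   *   UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
+   *   void* out_ptrs[] = { m2 };
+   *   umami_matrix_element( handle, n, n, 0, 2, in_keys, in_ptrs, 1, out_keys, out_ptrs );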
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, 
matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = 
CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + 
fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream 
+        gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+        // need to round up to double the page size for some reason
+        std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+        std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+        HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+        HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+        HostBufferBase<fptype> g_s( rounded_count );
+        HostBufferBase<fptype> helicity_random( rounded_count );
+        HostBufferBase<fptype> color_random( rounded_count );
+        HostBufferBase<fptype> diagram_random( rounded_count );
+        HostBufferBase<fptype> matrix_elements( rounded_count );
+        HostBufferBase<unsigned int> diagram_index( rounded_count );
+        HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+        HostBufferBase<fptype> denominators( rounded_count );
+        HostBufferBase<int> helicity_index( rounded_count );
+        HostBufferBase<int> color_index( rounded_count );
+        for( std::size_t i_event = 0; i_event < count; ++i_event )
+        {
+            transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+            helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+            color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+            diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+            g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+        }
+        computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+        InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+        if( !instance->initialized )
+        {
+            initialize(
+                momenta.data(),
+                couplings.data(),
+                matrix_elements.data(),
+                numerators.data(),
+                denominators.data(),
+                rounded_count );
+            instance->initialized = true;
+        }
+
+        sigmaKin(
+            momenta.data(),
+            couplings.data(),
+            helicity_random.data(),
+            color_random.data(),
+            nullptr,
+            diagram_random.data(),
+            matrix_elements.data(),
+            helicity_index.data(),
+            color_index.data(),
+            numerators.data(),
+            denominators.data(),
+            diagram_index.data(),
+            false,
+            rounded_count );
+
+        std::size_t page_size = MemoryAccessMomentaBase::neppM;
+        for( std::size_t i_event = 0; i_event < count; ++i_event )
+        {
+            std::size_t i_page = i_event / page_size;
+            std::size_t i_vector = i_event % page_size;
+
+            double denominator = denominators[i_event];
+            if( m2_out != nullptr )
+            {
+                m2_out[i_event + offset] = matrix_elements[i_event];
+            }
+            if( amp2_out != nullptr )
+            {
+                for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+                {
+                    amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+                }
+            }
+            if( diagram_out != nullptr )
+            {
+                diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+            }
+            if( color_out != nullptr )
+            {
+                color_out[i_event + offset] = color_index[i_event] - 1;
+            }
+            if( helicity_out != nullptr )
+            {
+                helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+            }
+        }
+#endif // MGONGPUCPP_GPUIMPL
+        return UMAMI_SUCCESS;
+    }
+
+    UmamiStatus umami_free( UmamiHandle handle )
+    {
+        InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+        for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+        {
+            if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+        }
+#endif
+        delete instance;
+        return UMAMI_SUCCESS;
+    }
+}
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                              _
+ *                             (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Queries a global property of the matrix element implementation.
+   *
+   * @param meta_key
+   *    key identifying the requested property
+   * @param result
+   *    pointer to caller-allocated memory receiving the value: an int for the
+   *    *_COUNT keys, a UmamiDevice for UMAMI_META_DEVICE
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
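+   *    (Note: the cudacpp implementation in umami.cc currently returns
+   *    UMAMI_ERROR_NOT_IMPLEMENTED from this call.)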
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. + * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. 
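+ *
+ * A minimal call sketch (hypothetical caller code, not part of the interface;
+ * it assumes `momenta` is a caller-filled double array of length 4 * npar * count
+ * and `m2` a double array of length count, with stride = count and offset = 0):
+ *
+ *   UmamiHandle h;
+ *   umami_initialize( &h, "param_card.dat" );
+ *   UmamiInputKey ikeys[] = { UMAMI_IN_MOMENTA };
+ *   const void* ivals[] = { momenta };
+ *   UmamiOutputKey okeys[] = { UMAMI_OUT_MATRIX_ELEMENT };
+ *   void* ovals[] = { m2 };
+ *   umami_matrix_element( h, count, count, 0, 1, ikeys, ivals, 1, okeys, ovals );
+ *   umami_free( h );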
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const 
double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* 
parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + 
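+        // NB (sketch of the intended semantics, assuming the gpuMallocAsync/gpuFreeAsync
+        // wrappers map to cudaMallocAsync/cudaFreeAsync or their HIP equivalents): these
+        // allocations are stream-ordered, so the buffers are only guaranteed valid for
+        // work enqueued on gpu_stream, and the matching gpuFreeAsync calls below defer
+        // the actual release until all previously enqueued work on that stream completes.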
gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase 
denominators( rounded_count ); + HostBufferBase helicity_index( rounded_count ); + HostBufferBase color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. 
Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. + */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param meta_key + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. 
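+ * Layout note (as implemented in the cudacpp umami.cc): per-event scalar outputs
+ * are written at index [offset + i_event], while multi-component arrays such as
+ * the momenta input and the per-diagram amp2 output place component c of event
+ * i_event at [stride * c + offset + i_event].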
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL 
+ color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case 
UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, 
*denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, 
gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index( rounded_count ); + HostBufferBase color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.h new file mode 
100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. + */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param meta_key + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. 
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. + * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. 
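+ *
+ * Only UMAMI_IN_MOMENTA is mandatory; if the random-number inputs are omitted
+ * they default to 0.5 per event, and if UMAMI_IN_ALPHA_S is omitted a fixed
+ * default g_s is used (see copy_inputs in umami.cc).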
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const 
double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* 
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // need to round up to twice the SIMD page size (required by the C++ sigmaKin implementation)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default corresponds to alpha_s = 0.118
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      // convert 1-based internal indices to the 0-based indices of the interface
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves global metadata of the matrix element implementation, e.g. the device
+   * it runs on or the number of particles, diagrams and helicity combinations.
+   *
+   * @param meta_key
+   *   key identifying the metadata entry to query
+   * @param result
+   *   pointer to memory that receives the value; its type depends on the key
+   * @return
+   *   UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *   path to the parameter file
+   * @param handle
+   *   receives a handle to the new instance. Has to be cleaned up by the caller
+   *   with `umami_free`.
+   * @return
+   *   UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter.
+   *
+   * @param handle
+   *   handle of a matrix element instance
+   * @param name
+   *   name of the parameter
+   * @param parameter_real
+   *   real part of the parameter value
+   * @param parameter_imag
+   *   imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *   UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter.
+   *
+   * @param handle
+   *   handle of a matrix element instance
+   * @param name
+   *   name of the parameter
+   * @param parameter_real
+   *   pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *   pointer to double to return imaginary part of the parameter value. Ignored
+   *   for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *   UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *   handle of a matrix element instance
+   * @param count
+   *   number of events to evaluate the matrix element for
+   * @param stride
+   *   stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *   offset of the event index
+   * @param input_count
+   *   number of inputs to the matrix element
+   * @param input_keys
+   *   pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *   pointer to an array of void pointers to the inputs. The type of the inputs
+   *   depends on the input key
+   * @param output_count
+   *   number of outputs to the matrix element
+   * @param output_keys
+   *   pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *   pointer to an array of void pointers to the outputs. The type of the outputs
+   *   depends on the output key. The caller is responsible for allocating memory for
+   *   the outputs.
+   * @return
+   *   UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance.
+   *
+   * @param handle
+   *   handle of a matrix element instance
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
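Usage note (illustrative only, not part of the patch): a minimal caller-side sketch of
the UMAMI C API added above. The param card path, event counts and zero-filled momenta
are placeholders; momenta must follow the documented layout
momenta[stride * ( npar * i_mom + i_part ) + i_event], and optional inputs that are
omitted here (random numbers, alpha_s) fall back to the defaults hard-coded in this
implementation (0.5 for random numbers, g_s for alpha_s = 0.118).

  // example.cc -- hypothetical standalone driver, linked against the process library
  #include "umami.h"

  #include <cstdio>
  #include <vector>

  int main()
  {
    UmamiHandle handle = nullptr;
    // "param_card.dat" is a placeholder path
    if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1;

    int npar = 0;
    umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar );

    const size_t count = 16, stride = 16, offset = 0;
    // momenta[stride * ( npar * i_mom + i_part ) + i_event]; fill with real
    // phase-space points before use (zeros only illustrate the layout)
    std::vector<double> momenta( stride * npar * 4, 0.0 );
    std::vector<double> m2( count );

    UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA };
    const void* inputs[] = { momenta.data() };
    UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
    void* outputs[] = { m2.data() };

    UmamiStatus status = umami_matrix_element( handle, count, stride, offset,
                                               1, in_keys, inputs, 1, out_keys, outputs );
    if( status == UMAMI_SUCCESS ) std::printf( "ME[0] = %g\n", m2[0] );

    umami_free( handle );
    return status == UMAMI_SUCCESS ? 0 : 1;
  }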